diff --git "a/debug/debug-final.py" "b/debug/debug-final.py" new file mode 100644--- /dev/null +++ "b/debug/debug-final.py" @@ -0,0 +1,24446 @@ +# from tvm.script import ir as I +# from tvm.script import tir as T +# from tvm.script import relax as R + +@I.ir_module +class Module: + I.module_attrs({"external_mods": [metadata["runtime.Module"][0], metadata["runtime.Module"][1], metadata["runtime.Module"][2], metadata["runtime.Module"][3], metadata["runtime.Module"][4], metadata["runtime.Module"][5], metadata["runtime.Module"][6], metadata["runtime.Module"][7], metadata["runtime.Module"][8], metadata["runtime.Module"][9], metadata["runtime.Module"][10], metadata["runtime.Module"][11], metadata["runtime.Module"][12], metadata["runtime.Module"][13], metadata["runtime.Module"][14]]}) + @T.prim_func + def NT_matmul(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + NT_matmul_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") + layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in 
T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(1)): + with T.block("layer_norm356_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) + T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) + T.reads(layer_norm356[v0, v1, v2]) + T.writes(layer_norm356_shared[v0, v1, v2]) + layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2] + for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.reads() + T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax0_ax1_fused_0 in range(T.int64(4)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) 
+ ax0_ax1_fused_1) + T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]) + T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1]) + model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) + T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) + T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // 
T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads() + T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + 
NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul[T.int64(0), T.int64(0), v0] = T.float16(0) + NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + + @T.prim_func + def NT_matmul3(layer_norm452: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), NT_matmul: T.Buffer((T.int64(1), T.int64(1), T.int64(51866)), "float32")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + NT_matmul_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(51866)), scope="local") + NT_matmul_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(51866)), scope="local") + model_decoder_embed_tokens_weight5_local = T.alloc_buffer((T.int64(51866), T.int64(1280)), "float16", scope="local") + layer_norm452_shared = T.alloc_buffer((T.int64(1), T.int64(1), 
T.int64(1280)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(12967), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(1)): + with T.block("layer_norm452_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3) + T.reads(layer_norm452[v0, v1, v2]) + T.writes(layer_norm452_shared[v0, v1, v2]) + layer_norm452_shared[v0, v1, v2] = layer_norm452[v0, v1, v2] + for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init < T.int64(51866)) + T.reads() + T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float32(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for 
ax0_ax1_fused_0 in range(T.int64(2)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_embed_tokens_weight5_local"): + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 < T.int64(51866)) + T.reads(model_decoder_embed_tokens_weight5[v0, v1]) + T.writes(model_decoder_embed_tokens_weight5_local[v0, v1]) + model_decoder_embed_tokens_weight5_local[v0, v1] = model_decoder_embed_tokens_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0]) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2 < T.int64(51866)) + T.reads(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) + 
T.writes(NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + T.Cast("float32", layer_norm452_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) * T.Cast("float32", model_decoder_embed_tokens_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0) + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866)) + T.reads() + T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float32(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + 
ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax2_fused_0_ax2_fused_1_fused % T.int64(4) + (ax2_fused_2_0 + ax2_fused_2_1)) < T.int64(51866)) + T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0) + v0 = T.axis.spatial(T.int64(51866), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.where(u_fused_ax0_fused_fused_0 * T.int64(4) + (T.Mul(T.int64(0), T.int64(4)) + ax1_fused_0_ax1_fused_1_fused % T.int64(4) + ax1_fused_2) < T.int64(51866)) + T.reads(NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul[T.int64(0), T.int64(0), v0] = T.float32(0) + NT_matmul[T.int64(0), T.int64(0), v0] = NT_matmul[T.int64(0), T.int64(0), v0] + NT_matmul_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] 
+ + @T.prim_func + def add(var_reshape708: T.handle, var_reshape709: T.handle, var_T_add: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape708 = T.match_buffer(var_reshape708, (batch_size, T.int64(1), T.int64(1280)), "float16") + reshape709 = T.match_buffer(var_reshape709, (batch_size, T.int64(1), T.int64(1280)), "float16") + T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_add"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(reshape708[v0, T.int64(0), v1], reshape709[v0, T.int64(0), v1]) + T.writes(T_add[v0, T.int64(0), v1]) + T_add[v0, T.int64(0), v1] = reshape708[v0, T.int64(0), v1] + reshape709[v0, T.int64(0), v1] + + @T.prim_func + def add4(var_add: T.handle, var_lv610: T.handle, var_T_add: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16") + lv610 = T.match_buffer(var_lv610, (batch_size, T.int64(1500), T.int64(1280)), "float16") + T_add = T.match_buffer(var_T_add, (batch_size, T.int64(1500), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_add"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + 
ax0_ax1_ax2_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) + T.reads(add[v0, v1, v2], lv610[v0, v1, v2]) + T.writes(T_add[v0, v1, v2]) + T_add[v0, v1, v2] = add[v0, v1, v2] + lv610[v0, v1, v2] + + @T.prim_func + def add5(var_reshape385: T.handle, var_reshape386: T.handle, var_T_add: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + reshape385 = T.match_buffer(var_reshape385, (T.int64(1), seq_len, T.int64(1280)), "float16") + reshape386 = T.match_buffer(var_reshape386, (T.int64(1), seq_len, T.int64(1280)), "float16") + T_add = T.match_buffer(var_T_add, (T.int64(1), seq_len, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_add"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) + T.reads(reshape385[T.int64(0), v0, v1], reshape386[T.int64(0), v0, v1]) + T.writes(T_add[T.int64(0), v0, v1]) + T_add[T.int64(0), v0, v1] = reshape385[T.int64(0), v0, v1] + reshape386[T.int64(0), v0, v1] + + @T.prim_func + def apply_bitmask_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_bitmask: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 
1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) + logits = T.match_buffer(var_logits, (batch_size, vocab_size)) + num_seq = T.int32(is_size_var=True) + seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32") + bitmask = T.match_buffer(var_bitmask, (batch_size, (vocab_size + 31) // 32), "int32") + # with T.block("root"): + for fused_s_v_0 in T.thread_binding((num_seq * vocab_size + 1023) // 1024, thread="blockIdx.x"): + for fused_s_v_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("block"): + vs = T.axis.spatial(num_seq, (fused_s_v_0 * 1024 + fused_s_v_1) // vocab_size) + vv = T.axis.spatial(vocab_size, (fused_s_v_0 * 1024 + fused_s_v_1) % vocab_size) + T.where(fused_s_v_0 * 1024 + fused_s_v_1 < num_seq * vocab_size) + T.reads(bitmask[seq_ids[vs], vv // 32], seq_ids[vs], logits[seq_ids[vs], vv]) + T.writes(logits[seq_ids[vs], vv]) + logits[seq_ids[vs], vv] = T.if_then_else(T.bitwise_and(T.shift_right(bitmask[seq_ids[vs], vv // 32], vv % 32), 1) == 1, logits[seq_ids[vs], vv], T.float32(-3.4028234663852886e+38)) + + @T.prim_func + def apply_logit_bias_inplace(var_logits: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_logit_bias: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) + logits = T.match_buffer(var_logits, (batch_size, vocab_size)) + num_token = T.int32(is_size_var=True) 
+ pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32") + token_ids = T.match_buffer(var_token_ids, (num_token,), "int32") + logit_bias = T.match_buffer(var_logit_bias, (num_token,)) + # with T.block("root"): + for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"): + for p1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("block"): + vp = T.axis.spatial(num_token, p0 * 1024 + p1) + T.where(p0 * 1024 + p1 < num_token) + T.reads(logits[pos2seq_id[vp], token_ids[vp]], pos2seq_id[vp], token_ids[vp], logit_bias[vp]) + T.writes(logits[pos2seq_id[vp], token_ids[vp]]) + logits[pos2seq_id[vp], token_ids[vp]] = logits[pos2seq_id[vp], token_ids[vp]] + logit_bias[vp] + + @T.prim_func + def apply_penalty_inplace(var_logits: T.handle, var_seq_ids: T.handle, var_pos2seq_id: T.handle, var_token_ids: T.handle, var_token_cnt: T.handle, var_penalties: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) + logits = T.match_buffer(var_logits, (batch_size, vocab_size)) + num_seq = T.int32(is_size_var=True) + seq_ids = T.match_buffer(var_seq_ids, (num_seq,), "int32") + num_token = T.int32(is_size_var=True) + pos2seq_id = T.match_buffer(var_pos2seq_id, (num_token,), "int32") + token_ids = T.match_buffer(var_token_ids, (num_token,), "int32") + token_cnt = T.match_buffer(var_token_cnt, (num_token,), "int32") + penalties = T.match_buffer(var_penalties, (num_seq, 3)) + # with T.block("root"): + for p0 in T.thread_binding((num_token + 1023) // 1024, thread="blockIdx.x"): + for p1 in 
T.thread_binding(1024, thread="threadIdx.x"): + with T.block("block"): + vp = T.axis.spatial(num_token, p0 * 1024 + p1) + T.where(p0 * 1024 + p1 < num_token) + T.reads(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]], seq_ids[pos2seq_id[vp]], pos2seq_id[vp], token_ids[vp], penalties[pos2seq_id[vp], 0:3], token_cnt[vp]) + T.writes(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]]) + logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] - (penalties[pos2seq_id[vp], 0] + T.Cast("float32", token_cnt[vp]) * penalties[pos2seq_id[vp], 1]) + logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] = T.if_then_else(logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] > T.float32(0), logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] * penalties[pos2seq_id[vp], 2], logits[seq_ids[pos2seq_id[vp]], token_ids[vp]] / penalties[pos2seq_id[vp], 2]) + + @T.prim_func + def argsort_thrust(var_probs: T.handle, var_lv: T.handle, var_topk_gpu_v1: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int64(), T.int64() + data_buf = T.match_buffer(var_probs, (batch_size, vocab_size), align=8) + workspace_buf = T.match_buffer(var_lv, (T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8) + indices_buf = T.match_buffer(var_topk_gpu_v1, (batch_size, vocab_size), "int32", align=8) + # with T.block("root"): + value_buf = T.alloc_buffer((batch_size, vocab_size), align=8) + with T.block("topk_gpu"): + T.reads() + T.writes() + T.call_packed("tvm.contrib.thrust.sort", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(value_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(indices_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, 0, T.int64(0)), 0, 
T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0))) + + @T.prim_func + def batch_decode_paged_kv(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + B = T.int32(is_size_var=True) + Q = T.match_buffer(Q_handle, (B, 20, 64), "float16") + max_num_pages = T.int32(is_size_var=True) + pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16") + page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1) + nnz_pages = T.int32(is_size_var=True) + page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1) + length_info = T.match_buffer(var_length_info, (B,), "int32", offset_factor=1) + k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", offset_factor=1) + q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1) + output = T.match_buffer(output_handle, (B, 20, 64), "float16") + lse = T.match_buffer(lse_handle, (B, 20)) + # with T.block("root"): + sm_scale: T.float32 = T.float32(0.18033688011112042) + for bx in T.thread_binding(B, thread="blockIdx.x"): + for fused_by_bz in 
T.thread_binding(20, thread="blockIdx.y"): + for ty in T.thread_binding(1, thread="threadIdx.y"): + for tx in T.thread_binding(16, thread="threadIdx.x"): + for tz in T.thread_binding(32, thread="threadIdx.z"): + with T.block("attn"): + T.reads(page_table_indptr[bx:bx + 2], length_info[bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68]) + T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty]) + Q_local = T.alloc_buffer((4,), "float16", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + K_smem = T.alloc_buffer((64, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((64, 64), "float16", scope="shared") + O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared") + md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared") + S_reduce_local = T.alloc_buffer((1,), scope="local") + t0 = T.alloc_buffer((1,), scope="local") + S_local = T.alloc_buffer((2,), scope="local") + QK_local = T.alloc_buffer((4,), scope="local") + V_local = T.alloc_buffer((4,), "float16", scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_prev = T.alloc_buffer((1,), scope="local") + other_m = T.alloc_buffer((1,), scope="local") + other_d = T.alloc_buffer((1,), scope="local") + exp_mprev = T.alloc_buffer((1,), scope="local") + exp_otherm = T.alloc_buffer((1,), scope="local") + other_o = T.alloc_buffer((4,), scope="local") + st_m = T.alloc_buffer((1,), scope="local") + st_d = T.alloc_buffer((1,), scope="local") + O_local = T.alloc_buffer((4,), scope="local") + by: T.int32 = fused_by_bz % 20 + bz: T.int32 = fused_by_bz // 20 + batch_idx: T.int32 = bx + cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx] + cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1] + kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + 
length_info[batch_idx], 0) + st_m[0] = T.float32(-50000) + st_d[0] = T.float32(1) + for vec in T.vectorized(4): + O_local[vec] = T.float32(0) + for vec in T.vectorized(4): + Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 32]))), Q[bx, by + bz + ty, tx * 4 + vec]) + for iterator in range((kv_chunk_len[0] + 63) // 64): + tile_start_s: T.int32 = (tz + ty) * 2 + tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2 + for j in range(2): + with T.block("KV_load"): + T.reads() + T.writes() + row_g: T.int32 = tile_start_g + j + if row_g < kv_chunk_len[0]: + seq_offset: T.int32 = row_g + page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + for vec in T.vectorized(4): + K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec]) + V_smem[tile_start_s + j, tx * 4 + 
vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec] + else: + for vec in T.vectorized(4): + K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) + V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) + T.tvm_storage_sync("shared") + m_prev[0] = st_m[0] + for j in range(2): + for vec in T.vectorized(4): + QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale + S_reduce_local[0] = T.float32(0) + for vec in T.unroll(4): + S_reduce_local[0] = S_reduce_local[0] + QK_local[vec] + with T.block("block_cross_thread"): + T.reads(S_reduce_local[0]) + T.writes(t0[0]) + T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx) + S_local[j] = T.float32(-50000) + if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]: + S_local[j] = t0[0] + st_m[0] = T.max(st_m[0], S_local[j]) + o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0]) + st_d[0] = st_d[0] * o_scale + for j in range(2): + S_local[j] = T.exp2(S_local[j] - st_m[0]) + st_d[0] = st_d[0] + S_local[j] + for j in T.vectorized(4): + O_local[j] = O_local[j] * o_scale + for j in range(2): + for vec in T.vectorized(4): + V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec] + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * S_local[j] + for vec in T.vectorized(4): + O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec] + md_allreduce[tz, ty, 0] = st_m[0] + md_allreduce[tz, ty, 1] = st_d[0] + T.tvm_storage_sync("shared") + st_m[0] = T.float32(-50000) + st_d[0] = T.float32(1) + for vec in T.vectorized(4): + O_local[vec] = T.float32(0) + for j in range(32): + m_prev[0] = st_m[0] + d_prev[0] = st_d[0] + other_m[0] = md_allreduce[j, ty, 0] + other_d[0] = md_allreduce[j, ty, 1] + for vec in T.vectorized(4): + other_o[vec] = O_allreduce[j, ty, tx * 4 + vec] + st_m[0] = 
T.max(st_m[0], other_m[0]) + st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0]) + exp_mprev[0] = T.exp2(m_prev[0] - st_m[0]) + exp_otherm[0] = T.exp2(other_m[0] - st_m[0]) + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0] + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] / st_d[0] + for vec in T.vectorized(4): + output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec]) + lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0]) + + @T.prim_func + def batch_decode_paged_kv_sliding_window(_0: T.int32, Q_handle: T.handle, pages_handle: T.handle, page_table_indptr_handle: T.handle, page_table_values_handle: T.handle, var_length_info: T.handle, k_rope_pos_offset_handle: T.handle, q_rope_position_handle: T.handle, output_handle: T.handle, lse_handle: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + B = T.int32(is_size_var=True) + Q = T.match_buffer(Q_handle, (B, 20, 64), "float16") + max_num_pages = T.int32(is_size_var=True) + pages = T.match_buffer(pages_handle, (max_num_pages, 2, 20, 16, 64), "float16") + page_table_indptr = T.match_buffer(page_table_indptr_handle, (B + 1,), "int32", offset_factor=1) + nnz_pages = T.int32(is_size_var=True) + page_table_values = T.match_buffer(page_table_values_handle, (nnz_pages,), "int32", offset_factor=1) + length_info = T.match_buffer(var_length_info, (3, B), "int32", offset_factor=1) + k_rope_pos_offset = T.match_buffer(k_rope_pos_offset_handle, (B,), "int32", 
offset_factor=1) + q_rope_position = T.match_buffer(q_rope_position_handle, (B,), "int32", offset_factor=1) + output = T.match_buffer(output_handle, (B, 20, 64), "float16") + lse = T.match_buffer(lse_handle, (B, 20)) + # with T.block("root"): + sm_scale: T.float32 = T.float32(0.18033688011112042) + for bx in T.thread_binding(B, thread="blockIdx.x"): + for fused_by_bz in T.thread_binding(20, thread="blockIdx.y"): + for ty in T.thread_binding(1, thread="threadIdx.y"): + for tx in T.thread_binding(16, thread="threadIdx.x"): + for tz in T.thread_binding(32, thread="threadIdx.z"): + with T.block("attn"): + T.reads(page_table_indptr[bx:bx + 2], length_info[0:3, bx], q_rope_position[bx], Q[bx, fused_by_bz // 20 + ty + fused_by_bz % 20, tx * 4 - 32:tx * 4 - 32 + 68]) + T.writes(output[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty, tx * 4:tx * 4 + 4], lse[bx, fused_by_bz % 20 + fused_by_bz // 20 + ty]) + Q_local = T.alloc_buffer((4,), "float16", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + K_smem = T.alloc_buffer((64, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((64, 64), "float16", scope="shared") + O_allreduce = T.alloc_buffer((32, 1, 64), scope="shared") + md_allreduce = T.alloc_buffer((32, 1, 2), scope="shared") + S_reduce_local = T.alloc_buffer((1,), scope="local") + t0 = T.alloc_buffer((1,), scope="local") + S_local = T.alloc_buffer((2,), scope="local") + QK_local = T.alloc_buffer((4,), scope="local") + V_local = T.alloc_buffer((4,), "float16", scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_prev = T.alloc_buffer((1,), scope="local") + other_m = T.alloc_buffer((1,), scope="local") + other_d = T.alloc_buffer((1,), scope="local") + exp_mprev = T.alloc_buffer((1,), scope="local") + exp_otherm = T.alloc_buffer((1,), scope="local") + other_o = T.alloc_buffer((4,), scope="local") + st_m = T.alloc_buffer((1,), scope="local") + st_d = T.alloc_buffer((1,), scope="local") + O_local = T.alloc_buffer((4,), 
scope="local") + by: T.int32 = fused_by_bz % 20 + bz: T.int32 = fused_by_bz // 20 + batch_idx: T.int32 = bx + cur_page_indptr_begin: T.int32 = page_table_indptr[batch_idx] + cur_page_indptr_end: T.int32 = page_table_indptr[batch_idx + 1] + kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, batch_idx] - length_info[1, batch_idx] + length_info[2, batch_idx], 0) + st_m[0] = T.float32(-50000) + st_d[0] = T.float32(1) + for vec in T.vectorized(4): + O_local[vec] = T.float32(0) + for vec in T.vectorized(4): + Q_local[vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", Q[bx, by + bz + ty, tx * 4 + vec]) + T.sin(T.Cast("float32", q_rope_position[batch_idx]) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, Q[bx, by + bz + ty, tx * 4 + vec + 32] * T.float16(-1), Q[bx, by + bz + ty, tx * 4 + vec - 32]))), Q[bx, by + bz + ty, tx * 4 + vec]) + for iterator in range((kv_chunk_len[0] + 63) // 64): + tile_start_s: T.int32 = (tz + ty) * 2 + tile_start_g: T.int32 = (iterator * 32 + tz + ty) * 2 + for j in range(2): + with T.block("KV_load"): + T.reads() + T.writes() + row_g: T.int32 = tile_start_g + j + if row_g < kv_chunk_len[0]: + seq_offset: T.int32 = T.if_then_else(row_g < length_info[2, batch_idx], row_g, row_g - length_info[2, batch_idx] + length_info[1, batch_idx]) + page_no: T.int32 = page_table_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + for vec in T.vectorized(4): + K_smem[tile_start_s + j, tx * 4 + vec] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, 
T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, tx * 4 + vec]) + T.sin(T.Cast("float32", k_rope_pos_offset[batch_idx] + row_g) * rope_scale / T.pow(rope_theta, T.Cast("float32", (tx * 4 + vec) * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(tx * 4 + vec < 32, pages[page_no, 0, by, page_offset, tx * 4 + vec + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, tx * 4 + vec - 32]))), pages[page_no, 0, by, page_offset, tx * 4 + vec]) + V_smem[tile_start_s + j, tx * 4 + vec] = pages[page_no, 1, by, page_offset, tx * 4 + vec] + else: + for vec in T.vectorized(4): + K_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) + V_smem[tile_start_s + j, tx * 4 + vec] = T.float16(0) + T.tvm_storage_sync("shared") + m_prev[0] = st_m[0] + for j in range(2): + for vec in T.vectorized(4): + QK_local[vec] = T.Cast("float32", Q_local[vec]) * T.Cast("float32", K_smem[tz * 2 + j, tx * 4 + vec]) * attn_score_scaling_factor * sm_scale + S_reduce_local[0] = T.float32(0) + for vec in T.unroll(4): + S_reduce_local[0] = S_reduce_local[0] + QK_local[vec] + with T.block("block_cross_thread"): + T.reads(S_reduce_local[0]) + T.writes(t0[0]) + T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), S_reduce_local[0], T.bool(True), t0[0], tx) + S_local[j] = T.float32(-50000) + if (iterator * 32 + tz) * 2 + j < kv_chunk_len[0]: + S_local[j] = t0[0] + st_m[0] = T.max(st_m[0], S_local[j]) + o_scale: T.float32 = T.exp2(m_prev[0] - st_m[0]) + st_d[0] = st_d[0] * o_scale + for j in range(2): + S_local[j] = T.exp2(S_local[j] - st_m[0]) + st_d[0] = st_d[0] + S_local[j] + for j in T.vectorized(4): + O_local[j] = O_local[j] * o_scale + for j in range(2): + for vec in T.vectorized(4): + V_local[vec] = V_smem[tz * 2 + j, tx * 4 + vec] + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] + T.Cast("float32", V_local[vec]) * 
S_local[j] + for vec in T.vectorized(4): + O_allreduce[tz, ty, tx * 4 + vec] = O_local[vec] + md_allreduce[tz, ty, 0] = st_m[0] + md_allreduce[tz, ty, 1] = st_d[0] + T.tvm_storage_sync("shared") + st_m[0] = T.float32(-50000) + st_d[0] = T.float32(1) + for vec in T.vectorized(4): + O_local[vec] = T.float32(0) + for j in range(32): + m_prev[0] = st_m[0] + d_prev[0] = st_d[0] + other_m[0] = md_allreduce[j, ty, 0] + other_d[0] = md_allreduce[j, ty, 1] + for vec in T.vectorized(4): + other_o[vec] = O_allreduce[j, ty, tx * 4 + vec] + st_m[0] = T.max(st_m[0], other_m[0]) + st_d[0] = d_prev[0] * T.exp2(m_prev[0] - st_m[0]) + other_d[0] * T.exp2(other_m[0] - st_m[0]) + exp_mprev[0] = T.exp2(m_prev[0] - st_m[0]) + exp_otherm[0] = T.exp2(other_m[0] - st_m[0]) + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] * exp_mprev[0] + other_o[vec] * exp_otherm[0] + for vec in T.vectorized(4): + O_local[vec] = O_local[vec] / st_d[0] + for vec in T.vectorized(4): + output[batch_idx, by + bz + ty, tx * 4 + vec] = T.Cast("float16", O_local[vec]) + lse[batch_idx, by + bz + ty] = st_m[0] + T.log2(st_d[0]) + + @T.prim_func + def batch_prefill_paged_kv(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + total_len = T.int32(is_size_var=True) + q = T.match_buffer(var_q, (total_len, 20, 64), 
"float16") + batch_size = T.int32(is_size_var=True) + q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) + max_num_pages = T.int32(is_size_var=True) + pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") + page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) + nnz_pages = T.int32(is_size_var=True) + page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) + length_info = T.match_buffer(var_length_info, (batch_size,), "int32", offset_factor=1) + k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) + q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) + output = T.match_buffer(var_output, (total_len, 20, 64), "float16") + lse = T.match_buffer(var_lse, (total_len, 20)) + # with T.block("root"): + for lbx in T.thread_binding(16, thread="blockIdx.x"): + for lby in T.thread_binding(20, thread="blockIdx.y"): + for lty in T.thread_binding(4, thread="threadIdx.y"): + for ltx in T.thread_binding(32, thread="threadIdx.x"): + with T.block("attn"): + bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) + T.reads() + T.writes() + tile_id = T.alloc_buffer((1,), "int32", scope="local") + batch_idx = T.alloc_buffer((1,), "int32", scope="local") + batch_tiles = T.alloc_buffer((1,), "int32", scope="local") + batch_rows = T.alloc_buffer((1,), "int32", scope="local") + iterator = T.alloc_buffer((1,), "int32", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") + K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + S_smem = T.alloc_buffer((32, 16), scope="shared") + S_local = T.alloc_buffer((32, 16), scope="local") + O_local = T.alloc_buffer((32, 64), scope="local") + m_smem = T.alloc_buffer((32,), 
scope="shared") + m_prev_smem = T.alloc_buffer((32,), scope="shared") + d_smem = T.alloc_buffer((32,), scope="shared") + m_new = T.alloc_buffer((1,), scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_new = T.alloc_buffer((1,), scope="local") + tile_id[0] = bx + batch_idx[0] = 0 + batch_rows[0] = q_indptr[1] - q_indptr[0] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + while T.tvm_thread_invariant(batch_idx[0] < batch_size): + while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: + tile_id[0] = tile_id[0] - batch_tiles[0] + batch_idx[0] = batch_idx[0] + 1 + if batch_idx[0] < batch_size: + b_idx: T.int32 = batch_idx[0] + batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + if T.tvm_thread_invariant(batch_idx[0] < batch_size): + b_idx: T.int32 = batch_idx[0] + LH_start: T.int32 = tile_id[0] * 32 + q_indptr_val: T.int32 = q_indptr[b_idx] + cur_page_indptr_begin: T.int32 = page_indptr[b_idx] + cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] + kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[b_idx], 0) + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + m_smem[row] = T.float32(-50000) + d_smem[row] = T.float32(1) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = T.float32(0) + T.tvm_storage_sync("shared") + for li_lj_fused_0 in range(4): + for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_lj_fused_2 in T.thread_binding(32, 
thread="threadIdx.x"): + for li_lj_fused_3 in T.vectorized(4): + with T.block("Q_load"): + i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) + j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = q_indptr_val + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) + else: + Q_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for iterator_1 in range((kv_chunk_len[0] + 15) // 16): + L_kv_start: T.int32 = iterator_1 * 16 + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("K_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + seq_offset: T.int32 = cur_L + page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 
64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) + else: + K_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("V_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + seq_offset: T.int32 = cur_L + page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + V_smem[i, j] = pages[page_no, 1, by, page_offset, j] + else: + V_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) + T.writes(S_local[0:32, 0:16]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(2, 2): + with T.block("S_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) + j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) + T.reads() + T.writes(S_local[i, j]) + S_local[i, j] = T.float32(0) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in 
T.thread_binding(32, thread="threadIdx.x"): + for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): + with T.block("S_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + k = T.axis.reduce(64, lk_0 * 8 + lk_1) + T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) + T.writes(S_local[i, j]) + S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) + T.tvm_storage_sync("shared") + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(2, 2): + with T.block("S_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + T.reads(S_local[i, j]) + T.writes(S_smem[i, j]) + S_smem[i, j] = S_local[i, j] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update1"): + T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) + T.writes(m_prev[i], m_new[i], d_new[i]) + m_prev[i] = m_smem[row] + m_new[i] = m_smem[row] + row_: T.int32 = LH_start + row + for j in range(16): + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + m_new[i] = T.max(m_new[i], S_smem[row, j]) + d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + with T.block("update"): + T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) + T.writes(S_smem[row, 0:16]) + for j in range(16): + if row < 32: + row_: T.int32 = LH_start + 
row + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) + else: + S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update"): + T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) + T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) + for j in range(16): + d_new[i] = d_new[i] + S_smem[row, j] + m_smem[row] = m_new[i] + d_smem[row] = d_new[i] + m_prev_smem[row] = m_prev[i] + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) + T.writes(O_local[0:32, 0:64]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(4, 4): + with T.block("O_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) + j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): + with T.block("O_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + k = T.axis.reduce(16, lk_0 * 8 + lk_1) + T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k], V_smem[k, j]) + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) + for 
li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) + T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) + for li_0 in range(1): + for li_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_2 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("lse_store"): + i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) + T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) + T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) + T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) + tile_id[0] = tile_id[0] + 16 + + @T.prim_func + def batch_prefill_paged_kv_sliding_window(_0: T.int32, var_q: T.handle, var_q_indptr: T.handle, var_pages: T.handle, var_page_indptr: T.handle, var_page_values: T.handle, var_length_info: T.handle, var_k_rope_pos_offset: T.handle, var_q_rope_position: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, 
"max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + total_len = T.int32(is_size_var=True) + q = T.match_buffer(var_q, (total_len, 20, 64), "float16") + batch_size = T.int32(is_size_var=True) + q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) + max_num_pages = T.int32(is_size_var=True) + pages = T.match_buffer(var_pages, (max_num_pages, 2, 20, 16, 64), "float16") + page_indptr = T.match_buffer(var_page_indptr, (batch_size + 1,), "int32", offset_factor=1) + nnz_pages = T.int32(is_size_var=True) + page_values = T.match_buffer(var_page_values, (nnz_pages,), "int32", offset_factor=1) + length_info = T.match_buffer(var_length_info, (3, batch_size), "int32", offset_factor=1) + k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) + q_rope_position = T.match_buffer(var_q_rope_position, (total_len,), "int32", offset_factor=1) + output = T.match_buffer(var_output, (total_len, 20, 64), "float16") + lse = T.match_buffer(var_lse, (total_len, 20)) + # with T.block("root"): + for lbx in T.thread_binding(16, thread="blockIdx.x"): + for lby in T.thread_binding(20, thread="blockIdx.y"): + for lty in T.thread_binding(4, thread="threadIdx.y"): + for ltx in T.thread_binding(32, thread="threadIdx.x"): + with T.block("attn"): + bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) + T.reads() + T.writes() + tile_id = T.alloc_buffer((1,), "int32", scope="local") + batch_idx = T.alloc_buffer((1,), "int32", scope="local") + batch_tiles = T.alloc_buffer((1,), "int32", scope="local") + batch_rows = T.alloc_buffer((1,), "int32", scope="local") + iterator = T.alloc_buffer((1,), "int32", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") + K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + S_smem = 
T.alloc_buffer((32, 16), scope="shared") + S_local = T.alloc_buffer((32, 16), scope="local") + O_local = T.alloc_buffer((32, 64), scope="local") + m_smem = T.alloc_buffer((32,), scope="shared") + m_prev_smem = T.alloc_buffer((32,), scope="shared") + d_smem = T.alloc_buffer((32,), scope="shared") + m_new = T.alloc_buffer((1,), scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_new = T.alloc_buffer((1,), scope="local") + tile_id[0] = bx + batch_idx[0] = 0 + batch_rows[0] = q_indptr[1] - q_indptr[0] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + while T.tvm_thread_invariant(batch_idx[0] < batch_size): + while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: + tile_id[0] = tile_id[0] - batch_tiles[0] + batch_idx[0] = batch_idx[0] + 1 + if batch_idx[0] < batch_size: + b_idx: T.int32 = batch_idx[0] + batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + if T.tvm_thread_invariant(batch_idx[0] < batch_size): + b_idx: T.int32 = batch_idx[0] + LH_start: T.int32 = tile_id[0] * 32 + q_indptr_val: T.int32 = q_indptr[b_idx] + cur_page_indptr_begin: T.int32 = page_indptr[b_idx] + cur_page_indptr_end: T.int32 = page_indptr[b_idx + 1] + kv_chunk_len[0] = T.if_then_else(cur_page_indptr_begin != cur_page_indptr_end, (cur_page_indptr_end - cur_page_indptr_begin - 1) * 16 + length_info[0, b_idx] - length_info[1, b_idx] + length_info[2, b_idx], 0) + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + m_smem[row] = T.float32(-50000) + d_smem[row] = T.float32(1) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads() + 
T.writes(O_local[i, j]) + O_local[i, j] = T.float32(0) + T.tvm_storage_sync("shared") + for li_lj_fused_0 in range(4): + for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for li_lj_fused_3 in T.vectorized(4): + with T.block("Q_load"): + i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) + j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = q_indptr_val + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) + else: + Q_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for iterator_1 in range((kv_chunk_len[0] + 15) // 16): + L_kv_start: T.int32 = iterator_1 * 16 + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("K_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, 
b_idx] + length_info[1, b_idx]) + page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", pages[page_no, 0, by, page_offset, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, pages[page_no, 0, by, page_offset, j + 32] * T.float16(-1), pages[page_no, 0, by, page_offset, j - 32]))), pages[page_no, 0, by, page_offset, j]) + else: + K_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("V_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + seq_offset: T.int32 = T.if_then_else(cur_L < length_info[2, b_idx], cur_L, cur_L - length_info[2, b_idx] + length_info[1, b_idx]) + page_no: T.int32 = page_values[cur_page_indptr_begin + seq_offset // 16] + page_offset: T.int32 = seq_offset % 16 + V_smem[i, j] = pages[page_no, 1, by, page_offset, j] + else: + V_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) + T.writes(S_local[0:32, 0:16]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, 
lj_1_init in T.grid(2, 2): + with T.block("S_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) + j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) + T.reads() + T.writes(S_local[i, j]) + S_local[i, j] = T.float32(0) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): + with T.block("S_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + k = T.axis.reduce(64, lk_0 * 8 + lk_1) + T.reads(S_local[i, j], Q_smem[i, k], K_smem[j, k]) + T.writes(S_local[i, j]) + S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k]) * T.Cast("float32", K_smem[j, k]) * attn_score_scaling_factor * T.float32(0.18033688011112042) + T.tvm_storage_sync("shared") + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(2, 2): + with T.block("S_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + T.reads(S_local[i, j]) + T.writes(S_smem[i, j]) + S_smem[i, j] = S_local[i, j] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update1"): + T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) + T.writes(m_prev[i], m_new[i], d_new[i]) + m_prev[i] = m_smem[row] + m_new[i] = m_smem[row] + row_: T.int32 = LH_start + row + for j in range(16): + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - 
q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + m_new[i] = T.max(m_new[i], S_smem[row, j]) + d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + with T.block("update"): + T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) + T.writes(S_smem[row, 0:16]) + for j in range(16): + if row < 32: + row_: T.int32 = LH_start + row + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) + else: + S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update"): + T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) + T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) + for j in range(16): + d_new[i] = d_new[i] + S_smem[row, j] + m_smem[row] = m_new[i] + d_smem[row] = d_new[i] + m_prev_smem[row] = m_prev[i] + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) + T.writes(O_local[0:32, 0:64]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(4, 4): + with T.block("O_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) + j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): + with 
T.block("O_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + k = T.axis.reduce(16, lk_0 * 8 + lk_1) + T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k], V_smem[k, j]) + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] + S_smem[i, k] * T.Cast("float32", V_smem[k, j]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) + T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) + for li_0 in range(1): + for li_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_2 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("lse_store"): + i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) + T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) + T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) + T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) + tile_id[0] = tile_id[0] + 16 + + @T.prim_func + def batch_prefill_ragged_kv(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_k_rope_pos_offset: T.handle, var_output: T.handle, var_lse: T.handle, causal: T.int32, rotary_mode: T.int32, rope_scale: 
T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + qo_len = T.int32(is_size_var=True) + q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") + batch_size = T.int32(is_size_var=True) + q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) + kv_len = T.int32(is_size_var=True) + k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") + v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") + kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) + q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) + k_rope_pos_offset = T.match_buffer(var_k_rope_pos_offset, (batch_size,), "int32", offset_factor=1) + output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") + lse = T.match_buffer(var_lse, (qo_len, 20)) + # with T.block("root"): + for lbx in T.thread_binding(16, thread="blockIdx.x"): + for lby in T.thread_binding(20, thread="blockIdx.y"): + for lty in T.thread_binding(4, thread="threadIdx.y"): + for ltx in T.thread_binding(32, thread="threadIdx.x"): + with T.block("attn"): + bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) + T.reads() + T.writes() + tile_id = T.alloc_buffer((1,), "int32", scope="local") + batch_idx = T.alloc_buffer((1,), "int32", scope="local") + batch_tiles = T.alloc_buffer((1,), "int32", scope="local") + batch_rows = T.alloc_buffer((1,), "int32", scope="local") + iterator = T.alloc_buffer((1,), "int32", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") + 
K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + S_smem = T.alloc_buffer((32, 16), scope="shared") + S_local = T.alloc_buffer((32, 16), scope="local") + O_local = T.alloc_buffer((32, 64), scope="local") + m_smem = T.alloc_buffer((32,), scope="shared") + m_prev_smem = T.alloc_buffer((32,), scope="shared") + d_smem = T.alloc_buffer((32,), scope="shared") + m_new = T.alloc_buffer((1,), scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_new = T.alloc_buffer((1,), scope="local") + tile_id[0] = bx + batch_idx[0] = 0 + batch_rows[0] = q_indptr[1] - q_indptr[0] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + while T.tvm_thread_invariant(batch_idx[0] < batch_size): + while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: + tile_id[0] = tile_id[0] - batch_tiles[0] + batch_idx[0] = batch_idx[0] + 1 + if batch_idx[0] < batch_size: + b_idx: T.int32 = batch_idx[0] + batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + if T.tvm_thread_invariant(batch_idx[0] < batch_size): + b_idx: T.int32 = batch_idx[0] + q_indptr_val: T.int32 = q_indptr[b_idx] + LH_start: T.int32 = tile_id[0] * 32 + kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + m_smem[row] = T.float32(-50000) + d_smem[row] = T.float32(1) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = T.float32(0) + T.tvm_storage_sync("shared") + for li_lj_fused_0 in range(4): + for 
li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for li_lj_fused_3 in T.vectorized(4): + with T.block("Q_load"): + i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) + j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = q_indptr_val + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + Q_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", q[cur_L, cur_H_qo, j]) + T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]))), q[cur_L, cur_H_qo, j]) + else: + Q_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for iterator_1 in range((kv_chunk_len[0] + 15) // 16): + L_kv_start: T.int32 = iterator_1 * 16 + L_kv_base: T.int32 = kv_indptr[b_idx] + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("K_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) 
/ T.float32(64))) * T.Cast("float32", k[L_kv_base + cur_L, by, j]) + T.sin(T.Cast("float32", k_rope_pos_offset[b_idx] + cur_L) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(j < 32, k[L_kv_base + cur_L, by, j + 32] * T.float16(-1), k[L_kv_base + cur_L, by, j - 32]))), k[L_kv_base + cur_L, by, j]) + else: + K_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("V_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_start + i + if cur_L < kv_chunk_len[0]: + V_smem[i, j] = v[L_kv_base + cur_L, by, j] + else: + V_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) + T.writes(S_local[0:32, 0:16]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(2, 2): + with T.block("S_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) + j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) + T.reads() + T.writes(S_local[i, j]) + S_local[i, j] = T.float32(0) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): + with T.block("S_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 
+ li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) + T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) + T.writes(S_local[i, j]) + S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) * T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) + T.tvm_storage_sync("shared") + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(2, 2): + with T.block("S_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + T.reads(S_local[i, j]) + T.writes(S_smem[i, j]) + S_smem[i, j] = S_local[i, j] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update1"): + T.reads(m_smem[row], kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) + T.writes(m_prev[i], m_new[i], d_new[i]) + m_prev[i] = m_smem[row] + m_new[i] = m_smem[row] + row_: T.int32 = LH_start + row + for j in range(16): + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + m_new[i] = T.max(m_new[i], S_smem[row, j]) + d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + with T.block("update"): + T.reads(kv_chunk_len[0], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) + T.writes(S_smem[row, 0:16]) + for j in range(16): + if row < 32: + row_: T.int32 = LH_start + row + if T.if_then_else(causal > 0, L_kv_start + j < kv_chunk_len[0] - (q_indptr[b_idx + 1] - q_indptr[b_idx]) + row_ + 1, L_kv_start + j < kv_chunk_len[0]): + S_smem[row, j] = T.exp2(S_smem[row, j] 
- m_new[i]) + else: + S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update"): + T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) + T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) + for j in range(16): + d_new[i] = d_new[i] + S_smem[row, j] + m_smem[row] = m_new[i] + d_smem[row] = d_new[i] + m_prev_smem[row] = m_prev[i] + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) + T.writes(O_local[0:32, 0:64]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(4, 4): + with T.block("O_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) + j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): + with T.block("O_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) + T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_store"): + i = 
T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) + T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) + for li_0 in range(1): + for li_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_2 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("lse_store"): + i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) + T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) + T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) + T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) + tile_id[0] = tile_id[0] + 16 + + @T.prim_func + def batch_tree_attn(var_q: T.handle, var_q_indptr: T.handle, var_k: T.handle, var_v: T.handle, var_kv_indptr: T.handle, var_q_rope_position: T.handle, var_mn_indptr: T.handle, var_mask: T.handle, var_output: T.handle, var_lse: T.handle, rotary_mode: T.int32, rope_scale: T.float32, rope_theta: T.float32, attn_score_scaling_factor: T.float32, batch_size: T.int32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + qo_len = T.int32(is_size_var=True) + q = T.match_buffer(var_q, (qo_len, 20, 64), "float16") + q_indptr = T.match_buffer(var_q_indptr, (batch_size + 1,), "int32", offset_factor=1) + 
kv_len = T.int32(is_size_var=True) + k = T.match_buffer(var_k, (kv_len, 20, 64), "float16") + v = T.match_buffer(var_v, (kv_len, 20, 64), "float16") + kv_indptr = T.match_buffer(var_kv_indptr, (batch_size + 1,), "int32", offset_factor=1) + q_rope_position = T.match_buffer(var_q_rope_position, (qo_len,), "int32", offset_factor=1) + mn_indptr = T.match_buffer(var_mn_indptr, (batch_size + 1,), "int32", offset_factor=1) + tree_size = T.int32(is_size_var=True) + mask = T.match_buffer(var_mask, (tree_size,), "int32", offset_factor=1) + output = T.match_buffer(var_output, (qo_len, 20, 64), "float16") + lse = T.match_buffer(var_lse, (qo_len, 20)) + # with T.block("root"): + for lbx in T.thread_binding(16, thread="blockIdx.x"): + for lby in T.thread_binding(20, thread="blockIdx.y"): + for lty in T.thread_binding(4, thread="threadIdx.y"): + for ltx in T.thread_binding(32, thread="threadIdx.x"): + with T.block("attn"): + bx, by, ty, tx = T.axis.remap("SSSS", [lbx, lby, lty, ltx]) + T.reads() + T.writes() + tile_id = T.alloc_buffer((1,), "int32", scope="local") + batch_idx = T.alloc_buffer((1,), "int32", scope="local") + batch_tiles = T.alloc_buffer((1,), "int32", scope="local") + batch_rows = T.alloc_buffer((1,), "int32", scope="local") + iterator = T.alloc_buffer((1,), "int32", scope="local") + kv_chunk_len = T.alloc_buffer((1,), "int32", scope="local") + Q_smem = T.alloc_buffer((32, 64), "float16", scope="shared") + K_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + V_smem = T.alloc_buffer((16, 64), "float16", scope="shared") + S_smem = T.alloc_buffer((32, 16), scope="shared") + S_local = T.alloc_buffer((32, 16), scope="local") + O_local = T.alloc_buffer((32, 64), scope="local") + m_smem = T.alloc_buffer((32,), scope="shared") + m_prev_smem = T.alloc_buffer((32,), scope="shared") + d_smem = T.alloc_buffer((32,), scope="shared") + m_new = T.alloc_buffer((1,), scope="local") + m_prev = T.alloc_buffer((1,), scope="local") + d_new = T.alloc_buffer((1,), 
scope="local") + tile_id[0] = bx + batch_idx[0] = 0 + batch_rows[0] = q_indptr[1] - q_indptr[0] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + while T.tvm_thread_invariant(batch_idx[0] < batch_size): + while tile_id[0] >= batch_tiles[0] and batch_idx[0] < batch_size: + tile_id[0] = tile_id[0] - batch_tiles[0] + batch_idx[0] = batch_idx[0] + 1 + if batch_idx[0] < batch_size: + b_idx: T.int32 = batch_idx[0] + batch_rows[0] = q_indptr[b_idx + 1] - q_indptr[b_idx] + batch_tiles[0] = (batch_rows[0] + 32 - 1) // 32 + if T.tvm_thread_invariant(batch_idx[0] < batch_size): + b_idx: T.int32 = batch_idx[0] + LH_start: T.int32 = tile_id[0] * 32 + q_indptr_val: T.int32 = q_indptr[b_idx] + kv_chunk_len[0] = kv_indptr[b_idx + 1] - kv_indptr[b_idx] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + m_smem[row] = T.float32(-50000) + d_smem[row] = T.float32(1) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = T.float32(0) + T.tvm_storage_sync("shared") + for li_lj_fused_0 in range(4): + for li_lj_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_lj_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for li_lj_fused_3 in T.vectorized(4): + with T.block("Q_load"): + i = T.axis.spatial(32, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) // 64) + j = T.axis.spatial(64, (li_lj_fused_0 * 512 + li_lj_fused_1 * 128 + li_lj_fused_2 * 4 + li_lj_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = q_indptr_val + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + Q_smem[i, j] 
= T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * q[cur_L, cur_H_qo, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, q[cur_L, cur_H_qo, j + 32] * T.float16(-1), q[cur_L, cur_H_qo, j - 32]), q[cur_L, cur_H_qo, j]) + else: + Q_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + for iterator_1 in range((kv_chunk_len[0] + 15) // 16): + L_kv_start: T.int32 = iterator_1 * 16 + L_kv_base: T.int32 = kv_indptr[b_idx] + for lz_ly_fused_0 in range(2): + for lz_ly_fused_1 in T.thread_binding(4, thread="threadIdx.y"): + for lz_ly_fused_2 in T.thread_binding(32, thread="threadIdx.x"): + for lz_ly_fused_3 in T.vectorized(4): + with T.block("KV_load"): + i = T.axis.spatial(16, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) // 64) + j = T.axis.spatial(64, (lz_ly_fused_0 * 512 + lz_ly_fused_1 * 128 + lz_ly_fused_2 * 4 + lz_ly_fused_3) % 64) + T.reads() + T.writes() + cur_L: T.int32 = L_kv_base + L_kv_start + i + if L_kv_start + i < kv_chunk_len[0]: + K_smem[i, j] = T.if_then_else(rotary_mode == 1, T.Cast("float16", T.cos(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * k[cur_L, by, j] + T.Cast("float16", T.sin(T.Cast("float32", q_rope_position[cur_L]) * rope_scale / T.pow(rope_theta, T.Cast("float32", j * 2 % 64) / T.float32(64)))) * T.if_then_else(j < 32, k[cur_L, by, j + 32] * T.float16(-1), k[cur_L, by, j - 32]), k[cur_L, by, j]) + V_smem[i, j] = v[cur_L, by, j] + else: + K_smem[i, j] = T.float16(0) + V_smem[i, j] = T.float16(0) + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(Q_smem[0:32, 0:64], K_smem[0:16, 0:64]) + T.writes(S_local[0:32, 0:16]) + for li_0_lj_0_fused_0_init in 
T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(2, 2): + with T.block("S_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 8 * 2 + li_1_init) + j = T.axis.spatial(16, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 8 * 2 + lj_1_init) + T.reads() + T.writes(S_local[i, j]) + S_local[i, j] = T.float32(0) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, li_1, lj_1, lk_1 in T.grid(8, 2, 2, 8): + with T.block("S_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + k_1 = T.axis.reduce(64, lk_0 * 8 + lk_1) + T.reads(S_local[i, j], Q_smem[i, k_1], K_smem[j, k_1]) + T.writes(S_local[i, j]) + S_local[i, j] = S_local[i, j] + T.Cast("float32", Q_smem[i, k_1]) * T.Cast("float32", K_smem[j, k_1]) * attn_score_scaling_factor * T.float32(0.18033688011112042) + T.tvm_storage_sync("shared") + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(2, 2): + with T.block("S_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 8 * 2 + li_1) + j = T.axis.spatial(16, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 8 * 2 + lj_1) + T.reads(S_local[i, j]) + T.writes(S_smem[i, j]) + S_smem[i, j] = S_local[i, j] + T.tvm_storage_sync("shared") + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update1"): + T.reads(m_smem[row], kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + 
L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], m_new[i], S_smem[row, 0:16], d_smem[row], m_prev[i]) + T.writes(m_prev[i], m_new[i], d_new[i]) + m_prev[i] = m_smem[row] + m_new[i] = m_smem[row] + row_: T.int32 = LH_start + row + for j in range(16): + if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1: + m_new[i] = T.max(m_new[i], S_smem[row, j]) + d_new[i] = d_smem[row] * T.exp2(m_prev[i] - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + with T.block("update"): + T.reads(kv_chunk_len[0], mask[mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start:mn_indptr[b_idx] + (LH_start + row) * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + L_kv_start + 16], mn_indptr[b_idx], q_indptr[b_idx:b_idx + 2], S_smem[row, 0:16], m_new[i]) + T.writes(S_smem[row, 0:16]) + for j in range(16): + if row < 32: + row_: T.int32 = LH_start + row + if L_kv_start + j < kv_chunk_len[0] and mask[mn_indptr[b_idx] + row_ * (q_indptr[b_idx + 1] - q_indptr[b_idx]) + (L_kv_start + j)] == 1: + S_smem[row, j] = T.exp2(S_smem[row, j] - m_new[i]) + else: + S_smem[row, j] = T.exp2(T.float32(-50000) - m_new[i]) + for i in range(1): + row: T.int32 = i * 32 * 4 + ty * 32 + tx + if row < 32: + with T.block("update"): + T.reads(d_new[i], S_smem[row, 0:16], m_new[i], m_prev[i]) + T.writes(d_new[i], m_smem[row], d_smem[row], m_prev_smem[row]) + for j in range(16): + d_new[i] = d_new[i] + S_smem[row, j] + m_smem[row] = m_new[i] + d_smem[row] = d_new[i] + m_prev_smem[row] = m_prev[i] + T.tvm_storage_sync("shared") + with T.block(""): + T.reads(m_prev_smem[0:32], m_smem[0:32], S_smem[0:32, 0:16], V_smem[0:16, 0:64]) + T.writes(O_local[0:32, 0:64]) + for li_0_lj_0_fused_0_init in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1_init in T.thread_binding(32, thread="threadIdx.x"): + for li_1_init, lj_1_init in T.grid(4, 4): + with 
T.block("O_gemm_init"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) // 16 * 4 + li_1_init) + j = T.axis.spatial(64, (li_0_lj_0_fused_0_init * 32 + li_0_lj_0_fused_1_init) % 16 * 4 + lj_1_init) + T.reads() + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] * T.exp2(m_prev_smem[i] - m_smem[i]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for lk_0, lk_1, li_1, lj_1 in T.grid(2, 8, 4, 4): + with T.block("O_gemm_update"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + k_1 = T.axis.reduce(16, lk_0 * 8 + lk_1) + T.reads(O_local[i, j], m_prev_smem[i], m_smem[i], S_smem[i, k_1], V_smem[k_1, j]) + T.writes(O_local[i, j]) + O_local[i, j] = O_local[i, j] + S_smem[i, k_1] * T.Cast("float32", V_smem[k_1, j]) + for li_0_lj_0_fused_0 in T.thread_binding(4, thread="threadIdx.y"): + for li_0_lj_0_fused_1 in T.thread_binding(32, thread="threadIdx.x"): + for li_1, lj_1 in T.grid(4, 4): + with T.block("O_store"): + i = T.axis.spatial(32, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) // 16 * 4 + li_1) + j = T.axis.spatial(64, (li_0_lj_0_fused_0 * 32 + li_0_lj_0_fused_1) % 16 * 4 + lj_1) + T.reads(q_indptr[b_idx:b_idx + 2], O_local[i, j], d_smem[i]) + T.writes(output[q_indptr[b_idx] + (LH_start + i), by, j]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + output[cur_L, cur_H_qo, j] = T.Cast("float16", O_local[i, j] / d_smem[i]) + for li_0 in range(1): + for li_1 in T.thread_binding(4, thread="threadIdx.y"): + for li_2 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("lse_store"): + i = T.axis.spatial(32, li_0 * 128 + li_1 * 32 + li_2) + T.where((li_0 * 4 + li_1) * 32 + li_2 < 32) + T.reads(q_indptr[b_idx:b_idx + 2], m_smem[i], d_smem[i]) + 
T.writes(lse[q_indptr[b_idx] + (LH_start + i), by]) + cur_L: T.int32 = q_indptr[b_idx] + (LH_start + i) + cur_H_qo: T.int32 = by + if cur_L < q_indptr[b_idx + 1]: + lse[cur_L, cur_H_qo] = m_smem[i] + T.log2(d_smem[i]) + tile_id[0] = tile_id[0] + 16 + + @T.prim_func + def batch_verify_on_gpu_single_kernel(var_draft_probs: T.handle, var_draft_tokens: T.handle, var_model_probs: T.handle, var_token_tree_first_child: T.handle, var_token_tree_next_sibling: T.handle, var_uniform_samples: T.handle, var_token_tree_parent_ptr: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + num_nodes, vocab_size = T.int32(is_size_var=True), T.int64() + draft_probs = T.match_buffer(var_draft_probs, (num_nodes, vocab_size)) + draft_tokens = T.match_buffer(var_draft_tokens, (num_nodes,), "int32") + model_probs = T.match_buffer(var_model_probs, (num_nodes, vocab_size)) + token_tree_first_child = T.match_buffer(var_token_tree_first_child, (num_nodes,), "int32") + token_tree_next_sibling = T.match_buffer(var_token_tree_next_sibling, (num_nodes,), "int32") + uniform_samples = T.match_buffer(var_uniform_samples, (num_nodes,)) + nbatch = T.int32(is_size_var=True) + token_tree_parent_ptr = T.match_buffer(var_token_tree_parent_ptr, (nbatch,), "int32") + # with T.block("root"): + child_ptr = T.alloc_buffer((1,), "int32", scope="local") + parent_ptr = T.alloc_buffer((1,), "int32", scope="local") + child_token = T.alloc_buffer((1,), "int32", scope="local") + done = T.alloc_buffer((1,), "bool", scope="local") + psum = T.alloc_buffer((1,), scope="local") + t0 = T.alloc_buffer((1,), scope="local") + model_prob_local = T.alloc_buffer((1,), scope="local") + draft_prob_local = T.alloc_buffer((1,), scope="local") + p_child = T.alloc_buffer((1,), 
scope="local") + q_child = T.alloc_buffer((1,), scope="local") + uniform_sample = T.alloc_buffer((1,), scope="local") + pred_shared = T.alloc_buffer((1,), "bool", scope="shared") + pred_local = T.alloc_buffer((1,), "bool", scope="local") + for _bx in T.thread_binding(nbatch, thread="blockIdx.x"): + for _tx in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("CTA"): + b, tx = T.axis.remap("SS", [_bx, _tx]) + T.reads(token_tree_parent_ptr[b], token_tree_first_child[T.min(parent_ptr[0], child_ptr[0]):T.min(parent_ptr[0], child_ptr[0]) + (T.max(parent_ptr[0], child_ptr[0]) + 1 - T.min(parent_ptr[0], child_ptr[0]))], parent_ptr[0], done[0], child_ptr[0], draft_tokens[child_ptr[0]], model_probs[parent_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], child_token[0], draft_probs[child_ptr[0], T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)):T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)) + (T.max(T.Cast("int64", child_token[0]), (vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) + T.Cast("int64", tx) - T.int64(1024)) + T.int64(1) - T.min(T.Cast("int64", child_token[0]), T.Cast("int64", tx)))], uniform_samples[child_ptr[0]], p_child[0], uniform_sample[0], q_child[0], pred_shared[0], pred_local[0], model_prob_local[0], draft_prob_local[0], psum[0], t0[0], token_tree_next_sibling[child_ptr[0]]) + T.writes(parent_ptr[0], child_ptr[0], done[0], child_token[0], p_child[0], q_child[0], uniform_sample[0], pred_shared[0], pred_local[0], psum[0], model_prob_local[0], draft_prob_local[0], t0[0], model_probs[parent_ptr[0], T.Cast("int64", tx):T.Cast("int64", tx) + ((vocab_size + T.int64(1023)) // T.int64(1024) * T.int64(1024) - T.int64(1023))], 
token_tree_parent_ptr[b]) + parent_ptr[0] = token_tree_parent_ptr[b] + child_ptr[0] = token_tree_first_child[parent_ptr[0]] + done[0] = T.bool(False) + while not done[0]: + T.tvm_storage_sync("shared") + if child_ptr[0] == -1: + done[0] = T.bool(True) + T.tvm_storage_sync("shared") + else: + if tx == 0: + child_token[0] = draft_tokens[child_ptr[0]] + p_child[0] = model_probs[parent_ptr[0], child_token[0]] + q_child[0] = draft_probs[child_ptr[0], child_token[0]] + uniform_sample[0] = uniform_samples[child_ptr[0]] + pred_shared[0] = p_child[0] >= uniform_sample[0] * q_child[0] + T.tvm_storage_sync("shared") + pred_local[0] = pred_shared[0] + if pred_local[0]: + parent_ptr[0] = child_ptr[0] + child_ptr[0] = token_tree_first_child[child_ptr[0]] + else: + psum[0] = T.float32(0) + for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): + if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: + model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] + draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] + model_prob_local[0] = T.max(model_prob_local[0] - draft_prob_local[0], T.float32(0)) + psum[0] = psum[0] + model_prob_local[0] + with T.block("block_cross_thread"): + T.reads(psum[0]) + T.writes(t0[0]) + T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), psum[0], T.bool(True), t0[0], tx) + if t0[0] < T.float32(9.9999999999999995e-08): + parent_ptr[0] = child_ptr[0] + child_ptr[0] = token_tree_first_child[child_ptr[0]] + else: + for i in range((vocab_size + T.int64(1023)) // T.int64(1024)): + if i * T.int64(1024) + T.Cast("int64", tx) < vocab_size: + model_prob_local[0] = model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] + draft_prob_local[0] = draft_probs[child_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] + model_prob_local[0] = T.max(model_prob_local[0] - 
draft_prob_local[0], T.float32(0)) + model_probs[parent_ptr[0], i * T.int64(1024) + T.Cast("int64", tx)] = model_prob_local[0] / t0[0] + child_ptr[0] = token_tree_next_sibling[child_ptr[0]] + if tx == 0: + token_tree_parent_ptr[b] = parent_ptr[0] + + @T.prim_func + def chunk_lse(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) + A = T.match_buffer(var_A, (batch_size, vocab_size)) + temperature = T.match_buffer(var_temperature, (batch_size,)) + num_chunks = T.int64(is_size_var=True) + chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) + chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) + # with T.block("root"): + temp_max_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") + temp_sum_shared = T.alloc_buffer((batch_size, num_chunks), scope="shared") + for ax0_ax1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("max"): + v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) + v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) + v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) + T.reads(temperature[v0], A[v0, v1 * 
T.int64(4096) + v2]) + T.writes(temp_max_shared[v0, v1]) + with T.init(): + temp_max_shared[v0, v1] = T.float32(-3.4028234663852886e+38) + temp_max_shared[v0, v1] = T.max(temp_max_shared[v0, v1], T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38))) + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax2_fused_0 in T.serial(T.int64(16), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("sum_exp"): + v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks + ax0) + v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks + ax1) + v2 = T.axis.reduce(T.int64(4096), ax2_fused_0 * T.int64(256) + ax2_fused_1) + T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_max_shared[v0, v1]) + T.writes(temp_sum_shared[v0, v1]) + with T.init(): + temp_sum_shared[v0, v1] = T.float32(0) + temp_sum_shared[v0, v1] = temp_sum_shared[v0, v1] + T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38)) - temp_max_shared[v0, v1]), T.Cast("float32", T.if_then_else(v1 * T.int64(4096) + v2 < vocab_size, T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), A[v0, v1 * T.int64(4096) + v2] / temperature[v0], A[v0, v1 * T.int64(4096) + v2]), T.float32(-3.4028234663852886e+38)) == temp_max_shared[v0, v1])), T.float32(0)) + for ax2_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax2_0 in T.serial(T.int64(1), 
annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("log"): + v0 = T.axis.spatial(batch_size, ax0_ax1_fused % (num_chunks * batch_size) // num_chunks) + v1 = T.axis.spatial(num_chunks, ax0_ax1_fused % num_chunks) + v2 = T.axis.spatial(T.int64(1), ax2_0 * T.int64(256) + ax2_1) + T.where(ax2_0 * T.int64(256) + ax2_1 < T.int64(1)) + T.reads(temperature[v0], temp_sum_shared[v0, v1], temp_max_shared[v0, v1]) + T.writes(chunked_sum[v0, v1], chunked_max[v0, v1]) + chunked_sum[v0, v1] = T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.log(temp_sum_shared[v0, v1]), temp_sum_shared[v0, v1]) + chunked_max[v0, v1] = temp_max_shared[v0, v1] + + @T.prim_func + def compact_kv_copy(var_pages: T.handle, var_copy_length_indptr: T.handle, var_copy_src_dst_pos: T.handle, batch_size: T.int32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + num_pages = T.int32() + pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") + copy_length_indptr = T.match_buffer(var_copy_length_indptr, (batch_size + 1,), "int32", offset_factor=1) + total_copy_length = T.int32() + copy_src_dst_pos = T.match_buffer(var_copy_src_dst_pos, (2, total_copy_length), "int32", offset_factor=1) + with T.block("root"): + T.reads() + T.writes() + for bhd_o in T.thread_binding((batch_size * 1280 + 1023) // 1024, thread="blockIdx.x"): + for bhd_i in T.thread_binding(1024, thread="threadIdx.x"): + b: T.int32 = (bhd_o * 1024 + bhd_i) // 1280 + h: T.int32 = (bhd_o * 1024 + bhd_i) // 64 % 20 + d: T.int32 = (bhd_o * 1024 + bhd_i) % 64 + if bhd_o * 1024 + bhd_i < batch_size * 20 * 64: + for i in range(copy_length_indptr[b + 1] - 
copy_length_indptr[b]): + src_pos: T.int32 = copy_src_dst_pos[0, copy_length_indptr[b] + i] + dst_pos: T.int32 = copy_src_dst_pos[1, copy_length_indptr[b] + i] + pages[dst_pos // 16, 0, h, dst_pos % 16, d] = pages[src_pos // 16, 0, h, src_pos % 16, d] + pages[dst_pos // 16, 1, h, dst_pos % 16, d] = pages[src_pos // 16, 1, h, src_pos % 16, d] + + @T.prim_func + def concatenate(var_reshape710: T.handle, var_reshape711: T.handle, var_reshape712: T.handle, var_T_concat: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape710 = T.match_buffer(var_reshape710, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + reshape711 = T.match_buffer(var_reshape711, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + reshape712 = T.match_buffer(var_reshape712, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + T_concat = T.match_buffer(var_T_concat, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_concat"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) + v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) + T.reads(reshape712[v0, T.int64(0), v1 + T.int64(-40), v2], reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2]) + T.writes(T_concat[v0, T.int64(0), v1, v2]) + T_concat[v0, T.int64(0), v1, v2] = T.if_then_else(T.int64(40) <= v1, 
reshape712[v0, T.int64(0), v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape711[v0, T.int64(0), v1 + T.int64(-20), v2], reshape710[v0, T.int64(0), v1, v2])) + + @T.prim_func + def concatenate1(var_reshape387: T.handle, var_reshape388: T.handle, var_reshape389: T.handle, var_T_concat: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + reshape387 = T.match_buffer(var_reshape387, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + reshape388 = T.match_buffer(var_reshape388, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + reshape389 = T.match_buffer(var_reshape389, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + T_concat = T.match_buffer(var_T_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_concat"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) + v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) + T.reads(reshape389[T.int64(0), v0, v1 + T.int64(-40), v2], reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2]) + T.writes(T_concat[T.int64(0), v0, v1, v2]) + T_concat[T.int64(0), v0, v1, v2] = T.if_then_else(T.int64(40) <= v1, reshape389[T.int64(0), v0, v1 - T.int64(40), v2], T.if_then_else(T.int64(20) <= v1, reshape388[T.int64(0), v0, v1 + T.int64(-20), v2], reshape387[T.int64(0), v0, v1, v2])) + + @T.prim_func + def 
copy_single_page(var_pages: T.handle, src_page_id: T.int64, tgt_page_id: T.int64, copy_length: T.int64): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + num_pages, page_size = T.int32(), T.int64() + pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") + # with T.block("root"): + for b in T.thread_binding((copy_length * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for t in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("copy"): + vh = T.axis.spatial(20, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) // (copy_length * T.int64(64)))) + vp = T.axis.spatial(copy_length, (b * T.int64(1024) + T.Cast("int64", t)) % (copy_length * T.int64(64)) // T.int64(64)) + vd = T.axis.spatial(64, T.Cast("int32", (b * T.int64(1024) + T.Cast("int64", t)) % T.int64(64))) + T.reads(pages[src_page_id, 0:2, vh, vp, vd]) + T.writes(pages[tgt_page_id, 0:2, vh, vp, vd]) + pages[tgt_page_id, 0, vh, vp, vd] = pages[src_page_id, 0, vh, vp, vd] + pages[tgt_page_id, 1, vh, vp, vd] = pages[src_page_id, 1, vh, vp, vd] + + @T.prim_func + def cumsum(var_sorted_probs: T.handle, var_lv1: T.handle, var_exclusive_scan_thrust: T.handle): + T.func_attr({"tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int64(), T.int64() + data_buf = T.match_buffer(var_sorted_probs, (batch_size, vocab_size), align=8) + workspace_buf = T.match_buffer(var_lv1, (T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12),), "uint8", align=8) + output_buf = T.match_buffer(var_exclusive_scan_thrust, (batch_size, vocab_size), align=8) + with 
T.block("exclusive_scan_thrust"): + T.reads() + T.writes() + T.call_packed("tvm.contrib.thrust.sum_scan", T.tvm_stack_make_array(data_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.tvm_stack_make_array(output_buf.data, T.tvm_stack_make_shape(batch_size, vocab_size), 0, 2, T.float32(0), T.int64(0)), T.bool(False), T.tvm_stack_make_array(workspace_buf.data, T.tvm_stack_make_shape(T.int64(8) * (batch_size * vocab_size * T.int64(4)) + T.int64(8388608) + batch_size * vocab_size * T.int64(12)), 0, 1, T.uint8(0), T.int64(0))) + + @T.prim_func + def full(var_result: T.handle, value: T.int32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + batch_size = T.int32(is_size_var=True) + result = T.match_buffer(var_result, (batch_size, 1), "int32") + # with T.block("root"): + for ax0_fused_0 in T.thread_binding((batch_size + 1023) // 1024, thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("block"): + v0 = T.axis.spatial(batch_size, ax0_fused_0 * 1024 + ax0_fused_1) + T.where(ax0_fused_0 * 1024 + ax0_fused_1 < batch_size) + T.reads() + T.writes(result[v0, 0]) + result[v0, 0] = value + + @T.prim_func + def fused_NT_matmul1_add8_gelu2(layer_norm358: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_weight5: T.Buffer((T.int64(5120), T.int64(1280)), "float16"), model_decoder_layers_0_fc1_bias5: T.Buffer((T.int64(5120),), "float16"), T_multiply_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + 
NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") + NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(256), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") + NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(64), T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="local") + model_decoder_layers_0_fc1_weight5_local = T.alloc_buffer((T.int64(5120), T.int64(1280)), "float16", scope="local") + layer_norm358_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(1280), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(1)): + with T.block("layer_norm358_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1 * T.int64(64) + ax2_2 + ax2_3) + T.reads(layer_norm358[v0, v1, v2]) + T.writes(layer_norm358_shared[v0, v1, v2]) + layer_norm358_shared[v0, v1, v2] = layer_norm358[v0, v1, v2] + for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * 
T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.reads() + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax0_ax1_fused_0 in range(T.int64(2)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_layers_0_fc1_weight5_local"): + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) + T.reads(model_decoder_layers_0_fc1_weight5[v0, v1]) + T.writes(model_decoder_layers_0_fc1_weight5_local[v0, v1]) + model_decoder_layers_0_fc1_weight5_local[v0, v1] = model_decoder_layers_0_fc1_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(256), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_2, vax1_fused_u_fused_0 = T.axis.remap("RR", [ax1_fused_u_fused_2, ax1_fused_u_fused_0]) + T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + 
vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused], model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused]) + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm358_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] * model_decoder_layers_0_fc1_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused] + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(64), ax0) + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads() + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(5120), 
u_fused_ax0_fused_fused_0 * T.int64(4) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(64), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(64), ax0) + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for ax0_fused_2 in 
range(T.int64(1)): + with T.block("T_multiply_2"): + v0 = T.axis.spatial(T.int64(5120), u_fused_ax0_fused_fused_0 * T.int64(4) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) + T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc1_bias5[v0]) + T.writes(T_multiply_intermediate[T.int64(0), T.int64(0), v0]) + T_multiply_intermediate[T.int64(0), T.int64(0), v0] = (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc1_bias5[v0]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) + + @T.prim_func + def fused_NT_matmul2_add7_add6(gelu130: T.Buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_weight5: T.Buffer((T.int64(1280), T.int64(5120)), "float16"), model_decoder_layers_0_fc2_bias5: T.Buffer((T.int64(1280),), "float16"), add1227: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + model_decoder_layers_0_fc2_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(5120)), "float16", scope="local") + gelu130_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(5120)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), 
thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in T.serial(T.int64(5), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(2)): + with T.block("gelu130_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(5120), ax2_0 * T.int64(1024) + ax2_1 * T.int64(64) + ax2_2 * T.int64(2) + ax2_3) + T.reads(gelu130[v0, v1, v2]) + T.writes(gelu130_shared[v0, v1, v2]) + gelu130_shared[v0, v1, v2] = gelu130[v0, v1, v2] + for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.reads() + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(20), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax0_ax1_fused_0 in range(T.int64(4)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_layers_0_fc2_weight5_local"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(5120), ax1_fused_u_fused_0 
* T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) + T.reads(model_decoder_layers_0_fc2_weight5[v0, v1]) + T.writes(model_decoder_layers_0_fc2_weight5_local[v0, v1]) + model_decoder_layers_0_fc2_weight5_local[v0, v1] = model_decoder_layers_0_fc2_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) + T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], gelu130_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + gelu130_shared[T.int64(0), T.int64(0), 
vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_fc2_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads() + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + 
T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0_fused_2 in range(T.int64(1)): + with T.block("T_add_1"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) + T.reads(add1227[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_fc2_bias5[v0]) + T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) + T_add_intermediate_1[T.int64(0), 
T.int64(0), v0] = add1227[T.int64(0), T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_fc2_bias5[v0]) + + @T.prim_func + def fused_NT_matmul_add7(layer_norm356: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_q_proj_bias5: T.Buffer((T.int64(1280),), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + model_decoder_layers_0_self_attn_q_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") + layer_norm356_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(1)): + with T.block("layer_norm356_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(1280), 
ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) + T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) + T.reads(layer_norm356[v0, v1, v2]) + T.writes(layer_norm356_shared[v0, v1, v2]) + layer_norm356_shared[v0, v1, v2] = layer_norm356[v0, v1, v2] + for u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.reads() + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax0_ax1_fused_0 in range(T.int64(4)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_layers_0_self_attn_q_proj_weight5_local"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) + T.reads(model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1]) + T.writes(model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1]) + model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_q_proj_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), 
T.int64(2)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) + T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + layer_norm356_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_q_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + 
vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_fused_2_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads() + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + 
NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0_fused_2 in range(T.int64(1)): + with T.block("T_add"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) + T.reads(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_q_proj_bias5[v0]) + T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) + T_add_intermediate[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_q_proj_bias5[v0] + + @T.prim_func + def fused_NT_matmul_add7_add6(reshape1361: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_out_proj_weight5: T.Buffer((T.int64(1280), T.int64(1280)), "float16"), 
model_decoder_layers_0_self_attn_out_proj_bias5: T.Buffer((T.int64(1280),), "float16"), add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_add_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + NT_matmul_intermediate_local = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local = T.alloc_buffer((T.int64(128), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((T.int64(32), T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="local") + model_decoder_layers_0_self_attn_out_proj_weight5_local = T.alloc_buffer((T.int64(1280), T.int64(1280)), "float16", scope="local") + reshape1361_shared = T.alloc_buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16", scope="shared") + for u_fused_ax0_fused_fused_0 in T.thread_binding(T.int64(80), thread="blockIdx.x"): + for u_fused_ax0_fused_fused_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_0 in T.serial(T.int64(3), annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}): + for ax2_1 in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_3 in T.vectorized(T.int64(1)): + with T.block("reshape1361_shared"): + v0, v1 = T.axis.remap("SS", [ax0, ax1]) + v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(512) + ax2_1 * T.int64(32) + ax2_2 + ax2_3) + T.where((ax2_0 * T.int64(16) + ax2_1) * T.int64(32) + ax2_2 + ax2_3 < T.int64(1280)) + T.reads(reshape1361[v0, v1, v2]) + T.writes(reshape1361_shared[v0, v1, v2]) + reshape1361_shared[v0, v1, v2] = reshape1361[v0, v1, v2] + for 
u_fused_ax0_fused_fused_2_init in range(T.int64(1)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1_init) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2_init) + T.reads() + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1_fused_u_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax0_ax1_fused_0 in range(T.int64(4)): + for ax0_ax1_fused_1 in T.vectorized(T.int64(2)): + with T.block("model_decoder_layers_0_self_attn_out_proj_weight5_local"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1) + v1 = T.axis.spatial(T.int64(1280), ax1_fused_u_fused_0 * T.int64(256) + ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(8) + ax0_ax1_fused_0 * T.int64(2) + ax0_ax1_fused_1) + T.reads(model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1]) + T.writes(model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1]) + model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, v1] = model_decoder_layers_0_self_attn_out_proj_weight5[v0, v1] + for u_fused_ax0_fused_fused_2, ax1_fused_u_fused_2 in T.grid(T.int64(1), T.int64(2)): + for ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 in T.vectorized(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused = T.axis.spatial(T.int64(128), ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + 
ax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + u_fused_ax0_fused_fused_1 + u_fused_ax0_fused_fused_2) + vax1_fused_u_fused_0, vax1_fused_u_fused_2 = T.axis.remap("RR", [ax1_fused_u_fused_0, ax1_fused_u_fused_2]) + T.reads(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0], reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)], model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)]) + T.writes(NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused, T.int64(0), T.int64(0), v0] + reshape1361_shared[T.int64(0), T.int64(0), vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] * model_decoder_layers_0_self_attn_out_proj_weight5_local[v0, vax1_fused_u_fused_0 * T.int64(256) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused // T.int64(4) * T.int64(8) + vax1_fused_u_fused_2 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused % T.int64(4)] + for ax2_fused_0_ax2_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax2_fused_2_0 in 
T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + for ax2_fused_2_1 in T.vectorized(T.int64(1)): + with T.block("NT_matmul_rf_init"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.spatial(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads() + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = T.float16(0) + for ax1 in range(T.int64(4)): + with T.block("NT_matmul_rf_update"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1 = T.axis.remap("SR", [ax0, ax1]) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax2_fused_0_ax2_fused_1_fused + ax2_fused_2_0 + ax2_fused_2_1) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0], NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 * T.int64(4) + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_1, T.int64(0), T.int64(0), v0] + for ax1_fused_2 in range(T.int64(1)): + for ax1_fused_0_ax1_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0 in 
T.thread_binding(T.int64(32), thread="threadIdx.x"): + with T.block("NT_matmul"): + vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0 = T.axis.reduce(T.int64(32), ax0) + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax1_fused_0_ax1_fused_1_fused + ax1_fused_2) + T.reads(NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0]) + T.writes(NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0]) + with T.init(): + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = T.float16(0) + NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] = NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + NT_matmul_intermediate_rf_local_1[vax1_fused_u_fused_1_ax1_fused_u_fused_3_fused_0, T.int64(0), T.int64(0), v0] + for ax0_fused_0_ax0_fused_1_fused in T.thread_binding(T.int64(16), thread="threadIdx.y"): + for ax0_fused_2 in range(T.int64(1)): + with T.block("T_add_1"): + v0 = T.axis.spatial(T.int64(1280), u_fused_ax0_fused_fused_0 * T.int64(16) + ax0_fused_0_ax0_fused_1_fused + ax0_fused_2) + T.reads(add1220[T.int64(0), T.int64(0), v0], NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0], model_decoder_layers_0_self_attn_out_proj_bias5[v0]) + T.writes(T_add_intermediate_1[T.int64(0), T.int64(0), v0]) + T_add_intermediate_1[T.int64(0), T.int64(0), v0] = add1220[T.int64(0), T.int64(0), v0] + (NT_matmul_intermediate_local[T.int64(0), T.int64(0), v0] + model_decoder_layers_0_self_attn_out_proj_bias5[v0]) + + @T.prim_func + def fused_add4_maximum_minimum(p_add4: T.handle, p_lv611: T.handle, p_output0: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + add4 = T.match_buffer(p_add4, (batch_size, T.int64(1500), T.int64(1280)), "float16") + lv611 = T.match_buffer(p_lv611, (batch_size, T.int64(1500), T.int64(1280)), "float16") + T_minimum_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") + 
# with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_minimum"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) + T.reads(add4[v0, v1, v2], lv611[v0, v1, v2]) + T.writes(T_minimum_intermediate[v0, v1, v2]) + T_minimum_intermediate[v0, v1, v2] = T.min(T.max(add4[v0, v1, v2] + lv611[v0, v1, v2], T.float16(-65504)), T.float16(65504)) + + @T.prim_func + def fused_conv1d1_add2_gelu1(p_gelu: T.handle, model_encoder_conv2_weight: T.Buffer((T.int64(1280), T.int64(1280), T.int64(3)), "float16"), lv3: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + gelu = T.match_buffer(p_gelu, (batch_size, T.int64(1280), T.int64(3000)), "float16") + T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(1500)), "float16") + # with T.block("root"): + conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(1500)), "float16", scope="shared") + for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(1920000), thread="blockIdx.x"): + for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): + for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax3_ax4_fused_0 in T.serial(T.int64(15), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("conv1d_ncw"): + v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000) + ax0) + v1 = 
T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500) + ax1) + v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500) + ax2) + v3 = T.axis.reduce(T.int64(1280), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) + v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) + T.reads(gelu[v0, v3, v2 * T.int64(2) + v4 - T.int64(1)], model_encoder_conv2_weight[v1, v3, v4]) + T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) + with T.init(): + conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) + conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 * T.int64(2) + v4 and v2 * T.int64(2) + v4 < T.int64(3001), gelu[v0, v3, v2 * T.int64(2) + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv2_weight[v1, v3, v4] + for ax3 in range(T.int64(1)): + for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_multiply_2"): + v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(1920000) // T.int64(1500)) + v2 = T.axis.spatial(T.int64(1500), ax0_ax1_ax2_fused % T.int64(1500)) + v3 = T.axis.spatial(T.int64(1), ax3) + v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) + T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) + T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv3[T.int64(0), v1, T.int64(0)]) + T.writes(T_multiply_intermediate[v0, v1, v2]) + T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv3[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) + + @T.prim_func + 
def fused_conv1d_add1_gelu(p_input_features: T.handle, model_encoder_conv1_weight: T.Buffer((T.int64(1280), T.int64(128), T.int64(3)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16"), p_output0: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + input_features = T.match_buffer(p_input_features, (batch_size, T.int64(128), T.int64(3000)), "float16") + T_multiply_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1280), T.int64(3000)), "float16") + # with T.block("root"): + conv1d_ncw_intermediate_shared = T.alloc_buffer((batch_size, T.int64(1280), T.int64(3000)), "float16", scope="shared") + for ax0_ax1_ax2_fused in T.thread_binding(batch_size * T.int64(3840000), thread="blockIdx.x"): + for ax0, ax1, ax2 in T.grid(T.int64(1), T.int64(1), T.int64(1)): + for ax3_ax4_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax3_ax4_fused_0 in T.serial(T.int64(2), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("conv1d_ncw"): + v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000) + ax0) + v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(3840000) // T.int64(3000) + ax1) + v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000) + ax2) + v3 = T.axis.reduce(T.int64(128), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) // T.int64(3)) + v4 = T.axis.reduce(T.int64(3), (ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1) % T.int64(3)) + T.where(ax3_ax4_fused_0 * T.int64(256) + ax3_ax4_fused_1 < T.int64(384)) + T.reads(input_features[v0, v3, v2 + v4 - T.int64(1)], model_encoder_conv1_weight[v1, v3, v4]) + T.writes(conv1d_ncw_intermediate_shared[v0, v1, v2]) + with T.init(): + conv1d_ncw_intermediate_shared[v0, v1, v2] = T.float16(0) + conv1d_ncw_intermediate_shared[v0, v1, v2] = conv1d_ncw_intermediate_shared[v0, v1, v2] + T.if_then_else(T.int64(1) <= v2 + v4 and v2 + v4 < 
T.int64(3001), input_features[v0, v3, v2 + v4 - T.int64(1)], T.float16(0)) * model_encoder_conv1_weight[v1, v3, v4] + for ax3 in range(T.int64(1)): + for ax4_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax4_0 in T.serial(T.int64(1), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_multiply_2"): + v0 = T.axis.spatial(batch_size, ax0_ax1_ax2_fused // T.int64(3840000)) + v1 = T.axis.spatial(T.int64(1280), ax0_ax1_ax2_fused % T.int64(3840000) // T.int64(3000)) + v2 = T.axis.spatial(T.int64(3000), ax0_ax1_ax2_fused % T.int64(3000)) + v3 = T.axis.spatial(T.int64(1), ax3) + v4 = T.axis.spatial(T.int64(1), ax4_0 * T.int64(256) + ax4_1) + T.where(ax4_0 * T.int64(256) + ax4_1 < T.int64(1)) + T.reads(conv1d_ncw_intermediate_shared[v0, v1, v2], lv1[T.int64(0), v1, T.int64(0)]) + T.writes(T_multiply_intermediate[v0, v1, v2]) + T_multiply_intermediate[v0, v1, v2] = (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * (T.float16(0.5) + T.Cast("float16", T.erf(T.Cast("float32", (conv1d_ncw_intermediate_shared[v0, v1, v2] + lv1[T.int64(0), v1, T.int64(0)]) * T.float16(0.70710678118654757)))) * T.float16(0.5)) + + @T.prim_func + def fused_reshape20_reshape20_add6(take7: T.Buffer((T.int64(1), T.int64(1280)), "float16"), take8: T.Buffer((T.int64(1), T.int64(1280)), "float16"), T_add_intermediate: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_add"): + v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(take7[T.int64(0), v0], take8[T.int64(0), v0]) + T.writes(T_add_intermediate[T.int64(0), T.int64(0), v0]) + 
T_add_intermediate[T.int64(0), T.int64(0), v0] = take7[T.int64(0), v0] + take8[T.int64(0), v0] + + @T.prim_func + def fused_reshape21_reshape21_reshape21_concatenate2_reshape22(add1221: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), lv1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), add1222: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1_2_3: T.Buffer((T.int64(1), T.int64(60), T.int64(64)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding(T.int64(4), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape_3"): + v0 = T.axis.spatial(T.int64(60), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) + v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(3840)) + T.reads(add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) + T.writes(T_reshape_intermediate_1_2_3[T.int64(0), v0, v1]) + T_reshape_intermediate_1_2_3[T.int64(0), v0, v1] = T.if_then_else(T.int64(40) <= v0, add1222[T.int64(0), T.int64(0), (v0 - T.int64(40)) * T.int64(64) + v1], T.if_then_else(T.int64(20) <= v0, lv1[T.int64(0), T.int64(0), (v0 + T.int64(-20)) * T.int64(64) + v1], add1221[T.int64(0), T.int64(0), v0 * T.int64(64) + v1])) + + @T.prim_func + def fused_reshape21_reshape25(add1225: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding(T.int64(2), 
thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape_1"): + v0 = T.axis.spatial(T.int64(20), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(64)) + v1 = T.axis.spatial(T.int64(64), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(64)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < T.int64(1280)) + T.reads(add1225[T.int64(0), T.int64(0), v0 * T.int64(64) + v1]) + T.writes(T_reshape_intermediate_1[T.int64(0), v0, v1]) + T_reshape_intermediate_1[T.int64(0), v0, v1] = add1225[T.int64(0), T.int64(0), v0 * T.int64(64) + v1] + + @T.prim_func + def fused_reshape23_reshape24(lv265: T.Buffer((T.int64(1), T.int64(20), T.int64(64)), "float16"), T_reshape_intermediate_1: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape_1"): + v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)]) + T.writes(T_reshape_intermediate_1[T.int64(0), T.int64(0), v0]) + T_reshape_intermediate_1[T.int64(0), T.int64(0), v0] = lv265[T.int64(0), v0 // T.int64(64), v0 % T.int64(64)] + + @T.prim_func + def fused_reshape9(packed_params_1: T.Buffer((T.int64(1280),), "float16"), T_reshape_intermediate: T.Buffer((T.int64(1), T.int64(1280), T.int64(1)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = 
T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(packed_params_1[v0]) + T.writes(T_reshape_intermediate[T.int64(0), v0, T.int64(0)]) + T_reshape_intermediate[T.int64(0), v0, T.int64(0)] = packed_params_1[v0] + + @T.prim_func + def fused_rope(var_qkv: T.handle, var_position_map: T.handle, var_q: T.handle, var_k: T.handle, var_v: T.handle, apply_rope: T.int32): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + qkv = T.match_buffer(var_qkv, (seq_len, 60, 64), "float16") + position_map = T.match_buffer(var_position_map, (seq_len,), "int32", offset_factor=1) + q = T.match_buffer(var_q, (seq_len, 20, 64), "float16") + k = T.match_buffer(var_k, (seq_len, 20, 64), "float16") + v = T.match_buffer(var_v, (seq_len, 20, 64), "float16") + # with T.block("root"): + for iters_0_iters_1_iters_2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for iters_0_iters_1_iters_2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("llama_fused_rope"): + s = T.axis.spatial(seq_len, (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) // T.int64(3840)) + h = T.axis.spatial(60, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(3840) // T.int64(64))) + d = T.axis.spatial(64, T.Cast("int32", (iters_0_iters_1_iters_2_fused_0 * T.int64(1024) + iters_0_iters_1_iters_2_fused_1) % T.int64(64))) + T.where(iters_0_iters_1_iters_2_fused_0 * 
T.int64(1024) + iters_0_iters_1_iters_2_fused_1 < seq_len * T.int64(3840)) + T.reads(position_map[s], qkv[s, h, d - 32:d - 32 + 65]) + T.writes(q[s, h, d], k[s, h - 20, d], v[s, h - 40, d]) + if h < 20: + q[s, h, d] = T.if_then_else(apply_rope > 0 and d < 64, T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) + else: + if h < 40: + k[s, h - 20, d] = T.if_then_else(apply_rope > 0 and d < 64, T.Cast("float16", T.cos(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", qkv[s, h, d]) + T.sin(T.Cast("float32", position_map[s]) / T.pow(T.float32(1), T.Cast("float32", d * 2 % 64) / T.float32(64))) * T.Cast("float32", T.if_then_else(d < 32, qkv[s, h, d + 32] * T.float16(-1), qkv[s, h, d - 32]))), qkv[s, h, d]) + else: + v[s, h - 40, d] = qkv[s, h, d] + + @T.prim_func + def fused_transpose_add3(packed_params_4: T.Buffer((T.int64(1500), T.int64(1280)), "float16"), p_gelu1: T.handle, p_output0: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + gelu1 = T.match_buffer(p_gelu1, (batch_size, T.int64(1280), T.int64(1500)), "float16") + T_add_intermediate = T.match_buffer(p_output0, (batch_size, T.int64(1500), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_add"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), 
(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) + T.reads(gelu1[v0, v2, v1], packed_params_4[v1, v2]) + T.writes(T_add_intermediate[v0, v1, v2]) + T_add_intermediate[v0, v1, v2] = gelu1[v0, v2, v1] + packed_params_4[v1, v2] + + @T.prim_func + def gather_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + m, n = T.int32(is_size_var=True), T.int32(is_size_var=True) + src = T.match_buffer(var_src, (m, n)) + batch_size = T.int32(is_size_var=True) + indices = T.match_buffer(var_indices, (batch_size,), "int32") + dst = T.match_buffer(var_dst, (batch_size, n)) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("gather_2d"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) + v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) + T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) + T.reads(src[indices[v0], v1], indices[v0]) + T.writes(dst[v0, v1]) + dst[v0, v1] = src[indices[v0], v1] + + @T.prim_func + def get_index_from_sorted(A: T.handle, B: T.handle, C: T.handle, D: T.handle, E: T.handle, F: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, 
"max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + batch, vocab_size = T.int64(), T.int64() + cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) + indices = T.match_buffer(B, (batch, vocab_size), "int32") + renorm_prob = T.match_buffer(C, (batch, 1)) + out_batch = T.int64() + usample = T.match_buffer(D, (out_batch, 1)) + sample_indices = T.match_buffer(E, (out_batch, 1), "int32") + output_index = T.match_buffer(F, (out_batch, 1), "int32") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((out_batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_get_index_from_sorted"): + v0 = T.axis.spatial(out_batch, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * out_batch) // vocab_size) + v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < out_batch * vocab_size) + T.reads(usample[v0, T.int64(0)], cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1):v1 - T.int64(1) + T.int64(2)], sample_indices[v0, T.int64(0)], renorm_prob[sample_indices[v0, T.int64(0)], 0], indices[sample_indices[v0, T.int64(0)], T.min(T.int64(0), v1):T.min(T.int64(0), v1) + (v1 + T.int64(1))]) + T.writes(output_index[v0, 0]) + if usample[v0, T.int64(0)] < cumsum_sorted[sample_indices[v0, T.int64(0)], v1] / renorm_prob[sample_indices[v0, T.int64(0)], 0] or v1 + T.int64(1) == vocab_size: + if v1 == T.int64(0): + output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], 0] + else: + if usample[v0, T.int64(0)] >= cumsum_sorted[sample_indices[v0, T.int64(0)], v1 - T.int64(1)] / renorm_prob[sample_indices[v0, T.int64(0)], 0]: + output_index[v0, 0] = indices[sample_indices[v0, T.int64(0)], v1] + + @T.prim_func + def get_renorm_prob(A: T.handle, B: T.handle, 
C: T.handle, D: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + batch, vocab_size = T.int64(), T.int64() + cumsum_sorted = T.match_buffer(A, (batch, vocab_size)) + top_p = T.match_buffer(B, (batch, 1)) + top_k = T.match_buffer(C, (batch, 1), "int32") + renorm_prob = T.match_buffer(D, (batch, 1)) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch * vocab_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_get_renorm_prob"): + v0 = T.axis.spatial(batch, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size * batch) // vocab_size) + v1 = T.axis.spatial(vocab_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch * vocab_size) + T.reads(cumsum_sorted[v0, T.min(T.min(T.int64(0), v1), v1 + T.int64(1)):T.min(T.min(T.int64(0), v1), v1 + T.int64(1)) + (v1 + T.int64(2))], top_p[v0, 0], top_k[v0, 0]) + T.writes(renorm_prob[v0, 0]) + if not (cumsum_sorted[v0, 0] < top_p[v0, 0] and top_k[v0, 0] > 1): + renorm_prob[v0, 0] = cumsum_sorted[v0, 0] + else: + if cumsum_sorted[v0, v1] < top_p[v0, 0] and v1 + T.int64(1) < T.Cast("int64", top_k[v0, 0]): + if v1 + T.int64(1) == vocab_size: + renorm_prob[v0, 0] = cumsum_sorted[v0, v1] + else: + if not (cumsum_sorted[v0, v1 + T.int64(1)] < top_p[v0, 0] and v1 + T.int64(1) + T.int64(1) < T.Cast("int64", top_k[v0, 0])): + renorm_prob[v0, 0] = cumsum_sorted[v0, v1 + T.int64(1)] + + @T.prim_func + def index(var_layer_norm355: T.handle, index: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": 
"cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + layer_norm355 = T.match_buffer(var_layer_norm355, (T.int64(1), seq_len, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("index"): + v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(layer_norm355[T.int64(0), seq_len - T.int64(1), v0]) + T.writes(index[T.int64(0), T.int64(0), v0]) + index[T.int64(0), T.int64(0), v0] = layer_norm355[T.int64(0), seq_len - T.int64(1), v0] + + @T.prim_func + def layer_norm(var_add578: T.handle, model_decoder_layers_0_self_attn_layer_norm_weight3: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias3: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + add578 = T.match_buffer(var_add578, (batch_size, T.int64(1), T.int64(1280)), "float16") + T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1), T.int64(1280)), "float16") + # with T.block("root"): + add578_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") + add578_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1)), scope="shared") + for ax0_fused in T.thread_binding(batch_size, thread="blockIdx.x"): + for ax0 in range(T.int64(1)): + for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("add578_red_temp"): + v0 = 
T.axis.spatial(batch_size, ax0_fused + ax0) + v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) + T.reads(add578[v0, T.int64(0), v1]) + T.writes(add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)]) + with T.init(): + add578_red_temp_v0_shared[v0, T.int64(0)] = T.float32(0) + add578_red_temp_v1_shared[v0, T.int64(0)] = T.float32(0) + v_add578_red_temp_v0: T.float32 = add578_red_temp_v0_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) + v_add578_red_temp_v1: T.float32 = add578_red_temp_v1_shared[v0, T.int64(0)] + T.Cast("float32", add578[v0, T.int64(0), v1]) * T.Cast("float32", add578[v0, T.int64(0), v1]) + add578_red_temp_v0_shared[v0, T.int64(0)] = v_add578_red_temp_v0 + add578_red_temp_v1_shared[v0, T.int64(0)] = v_add578_red_temp_v1 + for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_layer_norm"): + v0 = T.axis.spatial(batch_size, ax0_fused) + v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) + T.reads(add578[v0, T.int64(0), v1], add578_red_temp_v0_shared[v0, T.int64(0)], add578_red_temp_v1_shared[v0, T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight3[v1], model_decoder_layers_0_self_attn_layer_norm_bias3[v1]) + T.writes(T_layer_norm[v0, T.int64(0), v1]) + T_layer_norm[v0, T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add578[v0, T.int64(0), v1]) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add578_red_temp_v1_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) - add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004) * (add578_red_temp_v0_shared[v0, T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight3[v1] + 
model_decoder_layers_0_self_attn_layer_norm_bias3[v1] + + @T.prim_func + def layer_norm1(var_add: T.handle, model_encoder_layers_0_self_attn_layer_norm_weight: T.Buffer((T.int64(1280),), "float16"), model_encoder_layers_0_self_attn_layer_norm_bias: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + add = T.match_buffer(var_add, (batch_size, T.int64(1500), T.int64(1280)), "float16") + T_layer_norm = T.match_buffer(var_T_layer_norm, (batch_size, T.int64(1500), T.int64(1280)), "float16") + # with T.block("root"): + add_red_temp_v0_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") + add_red_temp_v1_shared = T.alloc_buffer((batch_size, T.int64(1500)), scope="shared") + for ax0_ax1_fused in T.thread_binding(batch_size * T.int64(1500), thread="blockIdx.x"): + for ax0, ax1 in T.grid(T.int64(1), T.int64(1)): + for ax2_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax2_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("add_red_temp"): + v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500) + ax0) + v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500) + ax1) + v2 = T.axis.reduce(T.int64(1280), ax2_fused_0 * T.int64(256) + ax2_fused_1) + T.reads(add[v0, v1, v2]) + T.writes(add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1]) + with T.init(): + add_red_temp_v0_shared[v0, v1] = T.float32(0) + add_red_temp_v1_shared[v0, v1] = T.float32(0) + v_add_red_temp_v0: T.float32 = add_red_temp_v0_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) + v_add_red_temp_v1: T.float32 = add_red_temp_v1_shared[v0, v1] + T.Cast("float32", add[v0, v1, v2]) * T.Cast("float32", add[v0, v1, v2]) + add_red_temp_v0_shared[v0, v1] = v_add_red_temp_v0 + add_red_temp_v1_shared[v0, v1] = v_add_red_temp_v1 + for ax2_1 in 
T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax2_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_layer_norm"): + v0 = T.axis.spatial(batch_size, ax0_ax1_fused // T.int64(1500)) + v1 = T.axis.spatial(T.int64(1500), ax0_ax1_fused % T.int64(1500)) + v2 = T.axis.spatial(T.int64(1280), ax2_0 * T.int64(256) + ax2_1) + T.reads(add[v0, v1, v2], add_red_temp_v0_shared[v0, v1], add_red_temp_v1_shared[v0, v1], model_encoder_layers_0_self_attn_layer_norm_weight[v2], model_encoder_layers_0_self_attn_layer_norm_bias[v2]) + T.writes(T_layer_norm[v0, v1, v2]) + T_layer_norm[v0, v1, v2] = T.Cast("float16", (T.Cast("float32", add[v0, v1, v2]) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) * T.rsqrt(add_red_temp_v1_shared[v0, v1] * T.float32(0.00078125000000000004) - add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004) * (add_red_temp_v0_shared[v0, v1] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_encoder_layers_0_self_attn_layer_norm_weight[v2] + model_encoder_layers_0_self_attn_layer_norm_bias[v2] + + @T.prim_func + def layer_norm2(var_add257: T.handle, model_decoder_layers_0_self_attn_layer_norm_weight2: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias2: T.Buffer((T.int64(1280),), "float16"), var_T_layer_norm: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + add257 = T.match_buffer(var_add257, (T.int64(1), seq_len, T.int64(1280)), "float16") + T_layer_norm = T.match_buffer(var_T_layer_norm, (T.int64(1), seq_len, T.int64(1280)), "float16") + # with T.block("root"): + add257_red_temp_v0_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") + add257_red_temp_v1_shared = T.alloc_buffer((T.int64(1), seq_len), scope="shared") + for ax0_fused in T.thread_binding(seq_len, thread="blockIdx.x"): + for ax0 in range(T.int64(1)): + 
for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("add257_red_temp"): + v0 = T.axis.spatial(seq_len, ax0_fused + ax0) + v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) + T.reads(add257[T.int64(0), v0, v1]) + T.writes(add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0]) + with T.init(): + add257_red_temp_v0_shared[T.int64(0), v0] = T.float32(0) + add257_red_temp_v1_shared[T.int64(0), v0] = T.float32(0) + v_add257_red_temp_v0: T.float32 = add257_red_temp_v0_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) + v_add257_red_temp_v1: T.float32 = add257_red_temp_v1_shared[T.int64(0), v0] + T.Cast("float32", add257[T.int64(0), v0, v1]) * T.Cast("float32", add257[T.int64(0), v0, v1]) + add257_red_temp_v0_shared[T.int64(0), v0] = v_add257_red_temp_v0 + add257_red_temp_v1_shared[T.int64(0), v0] = v_add257_red_temp_v1 + for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_layer_norm"): + v0 = T.axis.spatial(seq_len, ax0_fused) + v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) + T.reads(add257[T.int64(0), v0, v1], add257_red_temp_v0_shared[T.int64(0), v0], add257_red_temp_v1_shared[T.int64(0), v0], model_decoder_layers_0_self_attn_layer_norm_weight2[v1], model_decoder_layers_0_self_attn_layer_norm_bias2[v1]) + T.writes(T_layer_norm[T.int64(0), v0, v1]) + T_layer_norm[T.int64(0), v0, v1] = T.Cast("float16", (T.Cast("float32", add257[T.int64(0), v0, v1]) - add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) * T.rsqrt(add257_red_temp_v1_shared[T.int64(0), v0] * T.float32(0.00078125000000000004) - add257_red_temp_v0_shared[T.int64(0), v0] * 
T.float32(0.00078125000000000004) * (add257_red_temp_v0_shared[T.int64(0), v0] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight2[v1] + model_decoder_layers_0_self_attn_layer_norm_bias2[v1] + + @T.prim_func + def layer_norm3(add1220: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16"), model_decoder_layers_0_self_attn_layer_norm_weight5: T.Buffer((T.int64(1280),), "float16"), model_decoder_layers_0_self_attn_layer_norm_bias5: T.Buffer((T.int64(1280),), "float16"), T_layer_norm: T.Buffer((T.int64(1), T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + add1220_red_temp_v0_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") + add1220_red_temp_v1_shared = T.alloc_buffer((T.int64(1), T.int64(1)), scope="shared") + for ax0_fused in T.thread_binding(T.int64(1), thread="blockIdx.x"): + for ax0 in range(T.int64(1)): + for ax1_fused_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_fused_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("add1220_red_temp"): + v0 = T.axis.spatial(T.int64(1), ax0) + v1 = T.axis.reduce(T.int64(1280), ax1_fused_0 * T.int64(256) + ax1_fused_1) + T.reads(add1220[T.int64(0), T.int64(0), v1]) + T.writes(add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)]) + with T.init(): + add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = T.float32(0) + add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = T.float32(0) + v_add1220_red_temp_v0: T.float32 = add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) + v_add1220_red_temp_v1: T.float32 = add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] + T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) * T.Cast("float32", add1220[T.int64(0), 
T.int64(0), v1]) + add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v0 + add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] = v_add1220_red_temp_v1 + for ax1_1 in T.thread_binding(T.int64(256), thread="threadIdx.x"): + for ax1_0 in T.serial(T.int64(5), annotations={"pragma_auto_unroll_max_step": 256, "pragma_unroll_explicit": 1}): + with T.block("T_layer_norm"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + v1 = T.axis.spatial(T.int64(1280), ax1_0 * T.int64(256) + ax1_1) + T.reads(add1220[T.int64(0), T.int64(0), v1], add1220_red_temp_v0_shared[T.int64(0), T.int64(0)], add1220_red_temp_v1_shared[T.int64(0), T.int64(0)], model_decoder_layers_0_self_attn_layer_norm_weight5[v1], model_decoder_layers_0_self_attn_layer_norm_bias5[v1]) + T.writes(T_layer_norm[T.int64(0), T.int64(0), v1]) + T_layer_norm[T.int64(0), T.int64(0), v1] = T.Cast("float16", (T.Cast("float32", add1220[T.int64(0), T.int64(0), v1]) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) * T.rsqrt(add1220_red_temp_v1_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004) - add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004) * (add1220_red_temp_v0_shared[T.int64(0), T.int64(0)] * T.float32(0.00078125000000000004)) + T.float32(1.0000000000000001e-05))) * model_decoder_layers_0_self_attn_layer_norm_weight5[v1] + model_decoder_layers_0_self_attn_layer_norm_bias5[v1] + + @T.prim_func + def merge_state_inplace(v: T.handle, s: T.handle, v_other: T.handle, s_other: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + N, H, D = T.int32(is_size_var=True), 
T.int32(is_size_var=True), T.int32(is_size_var=True) + V = T.match_buffer(v, (N, H, D), "float16") + S = T.match_buffer(s, (N, H)) + V_other = T.match_buffer(v_other, (N, H, D), "float16") + S_other = T.match_buffer(s_other, (N, H)) + # with T.block("root"): + for bx in T.thread_binding(N, thread="blockIdx.x"): + for by in T.thread_binding(1, thread="blockIdx.y"): + for ty in T.thread_binding(20, thread="threadIdx.y"): + for tx in T.thread_binding(16, thread="threadIdx.x"): + with T.block("merge"): + T.reads(S[bx, ty + by * 20], S_other[bx, ty + by * 20], V[bx, ty + by * 20, tx * 4:tx * 4 + 4], V_other[bx, ty + by * 20, tx * 4:tx * 4 + 4]) + T.writes(V[bx, ty + by * 20, tx * 4:tx * 4 + 4], S[bx, ty + by * 20]) + s_val = T.alloc_buffer((1,), scope="local") + s_other_val = T.alloc_buffer((1,), scope="local") + s_max = T.alloc_buffer((1,), scope="local") + scale = T.alloc_buffer((1,), scope="local") + other_scale = T.alloc_buffer((1,), scope="local") + v_vec = T.alloc_buffer((4,), "float16", scope="local") + v_other_vec = T.alloc_buffer((4,), "float16", scope="local") + s_val[0] = S[bx, ty + by * 20] + s_other_val[0] = S_other[bx, ty + by * 20] + s_max[0] = T.max(s_val[0], s_other_val[0]) + s_val[0] = T.exp2(s_val[0] - s_max[0]) + s_other_val[0] = T.exp2(s_other_val[0] - s_max[0]) + scale[0] = s_val[0] / (s_val[0] + s_other_val[0]) + other_scale[0] = s_other_val[0] / (s_val[0] + s_other_val[0]) + for vec in T.vectorized(4): + v_vec[vec] = V[bx, ty + by * 20, tx * 4 + vec] + for vec in T.vectorized(4): + v_other_vec[vec] = V_other[bx, ty + by * 20, tx * 4 + vec] + for vec in range(4): + v_vec[vec] = T.Cast("float16", T.Cast("float32", v_vec[vec]) * scale[0] + T.Cast("float32", v_other_vec[vec]) * other_scale[0]) + for vec in T.vectorized(4): + V[bx, ty + by * 20, tx * 4 + vec] = v_vec[vec] + S[bx, ty + by * 20] = T.log2(s_val[0] + s_other_val[0]) + s_max[0] + + @T.prim_func + def parallel_sampling_from_prob(var_prob: T.handle, var_uniform_samples: T.handle, 
var_row_indices: T.handle, var_sampled_token_ids: T.handle): + T.func_attr({"tir.is_scheduled": 1}) + n, vocab_size = T.int64(), T.int64() + prob = T.match_buffer(var_prob, (n, vocab_size)) + batch_size = T.int64() + uniform_samples = T.match_buffer(var_uniform_samples, (batch_size, 1)) + row_indices = T.match_buffer(var_row_indices, (batch_size, 1), "int32") + token_ids = T.match_buffer(var_sampled_token_ids, (batch_size, 1), "int32") + # with T.block("root"): + aggregate = T.alloc_buffer((), scope="local") + sample_id_local = T.alloc_buffer((), "int32", scope="local") + step_iter = T.alloc_buffer((), "int32", scope="local") + for bx in T.thread_binding(batch_size, thread="blockIdx.x"): + row_idx: T.int32 = row_indices[bx, 0] + for ty in T.thread_binding(T.int64(4), thread="threadIdx.y"): + for tx in T.thread_binding(T.int64(32), thread="threadIdx.x"): + u: T.float32 = uniform_samples[bx, 0] + aggregate[()] = T.Cast("float32", 0) + step_iter[()] = 0 + while T.tvm_thread_invariant((step_iter[()] == 0 or aggregate[()] < u - T.float32(9.9999999999999995e-07)) and T.Cast("int64", step_iter[()]) < (vocab_size + T.int64(512) - T.int64(1)) // T.int64(512)): + with T.block(""): + T.reads(step_iter[()], prob[row_idx, T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4):T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + T.int64(4)], aggregate[()]) + T.writes(sample_id_local[()], aggregate[()]) + prob_gt_threshold = T.alloc_buffer((T.int64(4),), scope="local") + cumsum = T.alloc_buffer((T.int64(512),), scope="shared") + greater_than_u = T.alloc_buffer((T.int64(4),), "bool", scope="local") + mask = T.alloc_buffer((T.int64(4),), "bool", scope="local") + valid = T.alloc_buffer((T.int64(4),), "bool", scope="local") + indices = T.alloc_buffer((T.int64(4),), "int32", scope="local") + step_aggregate = T.alloc_buffer((), scope="local") + for v in T.unroll(T.int64(4)): + idx: T.int64 = T.Cast("int64", step_iter[()]) * 
T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v + prob_local: T.float32 = T.if_then_else(idx < vocab_size, prob[row_idx, idx], T.Cast("float32", 0)) + prob_gt_threshold[v] = T.if_then_else(prob_local > T.float32(0), prob_local, T.Cast("float32", 0)) + valid[v] = prob_local > T.float32(0) and idx < vocab_size + with T.block(""): + T.reads(prob_gt_threshold[T.int64(0):T.int64(4)]) + T.writes(step_aggregate[()]) + local_sum = T.alloc_buffer((), scope="local") + shared_buf = T.alloc_buffer((T.int64(128),), scope="shared") + idx: T.int64 = ty * T.int64(32) + tx + local_sum[()] = T.Cast("float32", 0) + for i in T.unroll(T.int64(4)): + local_sum[()] = local_sum[()] + prob_gt_threshold[i] + shared_buf[idx] = local_sum[()] + for i in T.unroll(T.int64(7)): + if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): + shared_buf[idx] = shared_buf[idx] + shared_buf[idx + T.shift_left(T.int64(1), i)] + step_aggregate[()] = shared_buf[0] + if T.tvm_thread_invariant(aggregate[()] + step_aggregate[()] >= u - T.float32(9.9999999999999995e-07)): + for i in T.unroll(T.int64(1), T.int64(4)): + prob_gt_threshold[i] = prob_gt_threshold[i] + prob_gt_threshold[i - T.int64(1)] + for i in T.vectorized(T.int64(4)): + cumsum[ty * T.int64(128) + tx * T.int64(4) + i] = prob_gt_threshold[i] + for i in T.unroll(T.int64(5)): + for j in T.vectorized(T.int64(4)): + idx: T.int64 = ty * T.int64(128) + tx * T.int64(4) + if tx >= T.shift_left(T.int64(1), i): + cumsum[idx + j] = cumsum[idx + j] + cumsum[idx - T.shift_left(T.int64(1), i) * T.int64(4) + T.int64(4) - T.int64(1)] + for i in T.unroll(T.int64(1), T.int64(4)): + for j in T.vectorized(T.int64(4)): + if ty == T.int64(0): + idx: T.int64 = i * T.int64(128) + tx * T.int64(4) + cumsum[idx + j] = cumsum[idx + j] + cumsum[i * T.int64(128) - T.int64(1)] + for v in T.unroll(T.int64(4)): + greater_than_u[v] = cumsum[ty * T.int64(128) + tx * T.int64(4) + v] + aggregate[()] >= u - T.float32(9.9999999999999995e-07) + with T.block(""): + 
T.reads(greater_than_u[T.int64(0):T.int64(4)]) + T.writes(mask[T.int64(0):T.int64(4)]) + shared_buf = T.alloc_buffer((T.int64(128),), "bool", scope="shared") + tx_idx: T.int64 = ty * T.int64(32) + tx + shared_buf[tx_idx] = greater_than_u[T.int64(3)] + mask[0] = T.if_then_else(tx_idx != T.int64(0), T.Cast("int8", greater_than_u[0]) != T.Cast("int8", shared_buf[tx_idx - T.int64(1)]), greater_than_u[0]) + for i in T.unroll(T.int64(1), T.int64(4)): + mask[i] = T.Cast("int8", greater_than_u[i]) != T.Cast("int8", greater_than_u[i - T.int64(1)]) + for v in T.unroll(T.int64(4)): + mask[v] = mask[v] and valid[v] + indices[v] = T.Cast("int32", T.Cast("int64", step_iter[()]) * T.int64(512) + ty * T.int64(128) + tx * T.int64(4) + v) + with T.block(""): + T.reads(mask[T.int64(0):T.int64(4)], indices[T.int64(0):T.int64(4)]) + T.writes(sample_id_local[()]) + local_sum = T.alloc_buffer((), "int32", scope="local") + shared_buf = T.alloc_buffer((T.int64(128),), "int32", scope="shared") + idx: T.int64 = ty * T.int64(32) + tx + local_sum[()] = T.Cast("int32", vocab_size - T.int64(1)) + for i in T.unroll(T.int64(4)): + if mask[i]: + local_sum[()] = T.min(local_sum[()], indices[i]) + shared_buf[idx] = local_sum[()] + for i in T.unroll(T.int64(7)): + if idx % T.shift_left(T.int64(1), i + T.int64(1)) == T.int64(0): + shared_buf[idx] = T.min(shared_buf[idx], shared_buf[idx + T.shift_left(T.int64(1), i)]) + sample_id_local[()] = shared_buf[0] + aggregate[()] = aggregate[()] + step_aggregate[()] + step_iter[()] = step_iter[()] + 1 + if tx == T.int64(0) and ty == T.int64(0): + token_ids[bx, 0] = sample_id_local[()] + + @T.prim_func + def reshape(var_lv: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + lv = T.match_buffer(var_lv, (batch_size, T.int64(1500), T.int64(1280)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") + # with 
T.block("root"): + for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) + v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) + T.reads(lv[v0, v1, v2 * T.int64(64) + v3]) + T.writes(T_reshape[v0, v1, v2, v3]) + T_reshape[v0, v1, v2, v3] = lv[v0, v1, v2 * T.int64(64) + v3] + + @T.prim_func + def reshape1(var_reshape256: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape256 = T.match_buffer(var_reshape256, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size * T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.reads(reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2]) + T.writes(T_reshape[v0, v1, 
v2]) + T_reshape[v0, v1, v2] = reshape256[v0 // T.int64(1500), v0 % T.int64(1500), v1, v2] + + @T.prim_func + def reshape10(var_lv4: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + lv4 = T.match_buffer(var_lv4, (batch_size * T.int64(1500), T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_ax3_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_ax3_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(1280) // T.int64(64)) + v3 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_ax3_fused_0 * T.int64(1024) + ax0_ax1_ax2_ax3_fused_1) % T.int64(64)) + T.reads(lv4[v0 * T.int64(1500) + v1, v2, v3]) + T.writes(T_reshape[v0, v1, v2, v3]) + T_reshape[v0, v1, v2, v3] = lv4[v0 * T.int64(1500) + v1, v2, v3] + + @T.prim_func + def reshape11(var_reshape6: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape6 = T.match_buffer(var_reshape6, (batch_size, T.int64(1500), T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1500), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding(batch_size * T.int64(1875), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with 
T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1920000)) + v1 = T.axis.spatial(T.int64(1500), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1920000) // T.int64(1280)) + v2 = T.axis.spatial(T.int64(1280), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280)) + T.reads(reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)]) + T.writes(T_reshape[v0, v1, v2]) + T_reshape[v0, v1, v2] = reshape6[v0, v1, v2 // T.int64(64), v2 % T.int64(64)] + + @T.prim_func + def reshape12(var_input_ids: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + input_ids = T.match_buffer(var_input_ids, (T.int64(1), seq_len), "int32") + T_reshape = T.match_buffer(var_T_reshape, (seq_len,), "int32") + # with T.block("root"): + for ax0_fused_0 in T.thread_binding((seq_len + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < seq_len) + T.reads(input_ids[T.int64(0), v0]) + T.writes(T_reshape[v0]) + T_reshape[v0] = input_ids[T.int64(0), v0] + + @T.prim_func + def reshape13(var_take: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + take = T.match_buffer(var_take, (seq_len, T.int64(1280)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * 
T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) + T.reads(take[v0, v1]) + T.writes(T_reshape[T.int64(0), v0, v1]) + T_reshape[T.int64(0), v0, v1] = take[v0, v1] + + @T.prim_func + def reshape14(var_lv416: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + lv416 = T.match_buffer(var_lv416, (T.int64(1), seq_len, T.int64(1280)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) + T.reads(lv416[T.int64(0), v0, v1 * T.int64(64) + v2]) + T.writes(T_reshape[T.int64(0), v0, v1, v2]) + T_reshape[T.int64(0), v0, v1, v2] = lv416[T.int64(0), v0, v1 * T.int64(64) + v2] + + @T.prim_func + def reshape15(var_concat: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + concat = T.match_buffer(var_concat, (T.int64(1), seq_len, T.int64(60), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(60), T.int64(64)), "float16") + # with T.block("root"): + 
for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) + v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(3840)) + T.reads(concat[T.int64(0), v0, v1, v2]) + T.writes(T_reshape[v0, v1, v2]) + T_reshape[v0, v1, v2] = concat[T.int64(0), v0, v1, v2] + + @T.prim_func + def reshape16(var_lv69: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + lv69 = T.match_buffer(var_lv69, (seq_len, T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) + T.reads(lv69[v0, v1, v2]) + T.writes(T_reshape[T.int64(0), v0, v1, v2]) + T_reshape[T.int64(0), v0, v1, v2] = lv69[v0, 
v1, v2] + + @T.prim_func + def reshape17(var_reshape391: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + reshape391 = T.match_buffer(var_reshape391, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (T.int64(1), seq_len, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < seq_len * T.int64(1280)) + T.reads(reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)]) + T.writes(T_reshape[T.int64(0), v0, v1]) + T_reshape[T.int64(0), v0, v1] = reshape391[T.int64(0), v0, v1 // T.int64(64), v1 % T.int64(64)] + + @T.prim_func + def reshape18(var_reshape393: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + reshape393 = T.match_buffer(var_reshape393, (T.int64(1), seq_len, T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (seq_len, T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((seq_len * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(seq_len, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % 
T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < seq_len * T.int64(1280)) + T.reads(reshape393[T.int64(0), v0, v1, v2]) + T.writes(T_reshape[v0, v1, v2]) + T_reshape[v0, v1, v2] = reshape393[T.int64(0), v0, v1, v2] + + @T.prim_func + def reshape19(input_ids: T.Buffer((T.int64(1), T.int64(1)), "int32"), T_reshape: T.Buffer((T.int64(1),), "int32")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(1), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(T.int64(1), T.int64(0)) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1)) + T.reads(input_ids[T.int64(0), T.int64(0)]) + T.writes(T_reshape[T.int64(0)]) + T_reshape[T.int64(0)] = input_ids[T.int64(0), T.int64(0)] + + @T.prim_func + def reshape2(var_input_ids: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + input_ids = T.match_buffer(var_input_ids, (batch_size, T.int64(1)), "int32") + T_reshape = T.match_buffer(var_T_reshape, (batch_size,), "int32") + # with T.block("root"): + for ax0_fused_0 in T.thread_binding((batch_size + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < batch_size) + T.reads(input_ids[v0, T.int64(0)]) + T.writes(T_reshape[v0]) + T_reshape[v0] = input_ids[v0, T.int64(0)] + + @T.prim_func + def reshape3(var_take3: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + 
batch_size = T.int64() + take3 = T.match_buffer(var_take3, (batch_size, T.int64(1280)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(take3[v0, v1]) + T.writes(T_reshape[v0, T.int64(0), v1]) + T_reshape[v0, T.int64(0), v1] = take3[v0, v1] + + @T.prim_func + def reshape4(var_lv224: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + lv224 = T.match_buffer(var_lv224, (batch_size, T.int64(1), T.int64(1280)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) + T.reads(lv224[v0, T.int64(0), v1 * 
T.int64(64) + v2]) + T.writes(T_reshape[v0, T.int64(0), v1, v2]) + T_reshape[v0, T.int64(0), v1, v2] = lv224[v0, T.int64(0), v1 * T.int64(64) + v2] + + @T.prim_func + def reshape5(var_concat32: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + concat32 = T.match_buffer(var_concat32, (batch_size, T.int64(1), T.int64(60), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(60), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(3840) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(3840)) + v1 = T.axis.spatial(T.int64(60), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(3840) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(3840)) + T.reads(concat32[v0, T.int64(0), v1, v2]) + T.writes(T_reshape[v0, v1, v2]) + T_reshape[v0, v1, v2] = concat32[v0, T.int64(0), v1, v2] + + @T.prim_func + def reshape6(var_lv134: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + lv134 = T.match_buffer(var_lv134, (batch_size, T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with 
T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) + T.reads(lv134[v0, v1, v2]) + T.writes(T_reshape[v0, T.int64(0), v1, v2]) + T_reshape[v0, T.int64(0), v1, v2] = lv134[v0, v1, v2] + + @T.prim_func + def reshape7(var_reshape714: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape714 = T.match_buffer(var_reshape714, (batch_size, T.int64(1), T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(1), T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)]) + T.writes(T_reshape[v0, T.int64(0), v1]) + T_reshape[v0, T.int64(0), v1] = reshape714[v0, T.int64(0), v1 // T.int64(64), v1 % T.int64(64)] + + @T.prim_func + def reshape8(var_reshape716: T.handle, var_T_reshape: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape716 = T.match_buffer(var_reshape716, (batch_size, 
T.int64(1), T.int64(20), T.int64(64)), "float16") + T_reshape = T.match_buffer(var_T_reshape, (batch_size, T.int64(20), T.int64(64)), "float16") + # with T.block("root"): + for ax0_ax1_ax2_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_ax2_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_reshape"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(20), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(1280) // T.int64(64)) + v2 = T.axis.spatial(T.int64(64), (ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1) % T.int64(64)) + T.where(ax0_ax1_ax2_fused_0 * T.int64(1024) + ax0_ax1_ax2_fused_1 < batch_size * T.int64(1280)) + T.reads(reshape716[v0, T.int64(0), v1, v2]) + T.writes(T_reshape[v0, v1, v2]) + T_reshape[v0, v1, v2] = reshape716[v0, T.int64(0), v1, v2] + + @T.prim_func + def sampler_take_probs_tir(var_unsorted_probs: T.handle, var_sorted_indices: T.handle, var_sample_indices: T.handle, var_sampling_results: T.handle, var_top_prob_offsets: T.handle, var_sampled_values: T.handle, var_top_prob_probs: T.handle, var_top_prob_indices: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1}) + batch_size, vocab_size = T.int32(is_size_var=True), T.int32(is_size_var=True) + unsorted_probs = T.match_buffer(var_unsorted_probs, (batch_size, vocab_size)) + sorted_indices = T.match_buffer(var_sorted_indices, (batch_size, vocab_size), "int32") + num_samples = T.int32(is_size_var=True) + sample_indices = 
T.match_buffer(var_sample_indices, (num_samples,), "int32") + sampling_results = T.match_buffer(var_sampling_results, (num_samples,), "int32") + num_positions = T.int32(is_size_var=True) + top_prob_offsets = T.match_buffer(var_top_prob_offsets, (num_positions,), "int32") + sampled_values = T.match_buffer(var_sampled_values, (num_samples,)) + top_prob_probs = T.match_buffer(var_top_prob_probs, (num_positions,)) + top_prob_indices = T.match_buffer(var_top_prob_indices, (num_positions,), "int32") + # with T.block("root"): + for ax0_fused_0 in T.thread_binding((num_positions + num_samples + 1023) // 1024, thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("block"): + v0 = T.axis.spatial(num_positions + num_samples, ax0_fused_0 * 1024 + ax0_fused_1) + T.where(ax0_fused_0 * 1024 + ax0_fused_1 < num_positions + num_samples) + T.reads(top_prob_offsets[v0], sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], unsorted_probs[T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]):T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 + (0 - num_positions)]) + (T.max(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions]) + 1 - T.min(top_prob_offsets[v0] // vocab_size, sample_indices[v0 - num_positions])), T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]):T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 + (0 - num_positions)]) + (T.max(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]) + 1 - T.min(sorted_indices[top_prob_offsets[v0] // vocab_size, top_prob_offsets[v0] % vocab_size], sampling_results[v0 - num_positions]))], sample_indices[v0 + (0 - num_positions)], sampling_results[v0 + (0 - num_positions)]) + 
T.writes(top_prob_indices[v0], top_prob_probs[v0], sampled_values[v0 + (0 - num_positions)]) + if v0 < num_positions: + row: T.int32 = top_prob_offsets[v0] // vocab_size + col: T.int32 = top_prob_offsets[v0] % vocab_size + top_prob_indices[v0] = sorted_indices[row, col] + top_prob_probs[v0] = unsorted_probs[row, sorted_indices[row, col]] + else: + vj: T.int32 = v0 - num_positions + sampled_values[vj] = unsorted_probs[sample_indices[vj], sampling_results[vj]] + + @T.prim_func + def scatter_probs(var_src: T.handle, var_indices: T.handle, var_dst: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size, n = T.int32(is_size_var=True), T.int32(is_size_var=True) + src = T.match_buffer(var_src, (batch_size, n)) + indices = T.match_buffer(var_indices, (batch_size,), "int32") + m = T.int32(is_size_var=True) + dst = T.match_buffer(var_dst, (m, n)) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * n + 1023) // 1024, thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("scatter_2d"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % (n * batch_size) // n) + v1 = T.axis.spatial(n, (ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1) % n) + T.where(ax0_ax1_fused_0 * 1024 + ax0_ax1_fused_1 < batch_size * n) + T.reads(src[v0, v1], indices[v0]) + T.writes(dst[indices[v0], v1]) + dst[indices[v0], v1] = src[v0, v1] + + @T.prim_func + def shape_func(H: T.Buffer((T.int64(2),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) + + @T.prim_func + def shape_func1(H: 
T.Buffer((T.int64(3),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(1)] = H[T.int64(0)] * T.int64(1500) + + @T.prim_func + def shape_func2(H: T.Buffer((T.int64(5),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(4)] = T.int64(8) * H[T.int64(1)] * T.int64(4) + H[T.int64(3)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) + H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) + + @T.prim_func + def shape_func3(H: T.Buffer((T.int64(6),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(4)] = T.int64(8) * (H[T.int64(0)] * H[T.int64(1)] * T.int64(4)) + T.int64(8388608) + H[T.int64(0)] * H[T.int64(1)] * T.int64(12) + H[T.int64(3)] = T.int64(8) * H[T.int64(1)] * T.int64(4) * T.int64(8) + T.int64(8388608) + T.int64(8) * H[T.int64(1)] * T.int64(12) + H[T.int64(5)] = T.int64(32) * H[T.int64(1)] + + @T.prim_func + def shape_func4(H: T.Buffer((T.int64(3),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(2)] = T.int64(8) * H[T.int64(1)] * T.int64(4) + + @T.prim_func + def shape_func5(H: T.Buffer((T.int64(5),), "int64")): + T.func_attr({"tir.is_host_func": 1}) + H[T.int64(2)] = T.int64(32) * ((H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096)) + H[T.int64(4)] = T.int64(32) * H[T.int64(1)] + H[T.int64(3)] = (H[T.int64(1)] + T.int64(4096) - T.int64(1)) // T.int64(4096) + + @T.prim_func + def softmax_with_chunked_sum(var_A: T.handle, var_temperature: T.handle, var_chunked_sum: T.handle, var_chunked_max: T.handle, var_softmax: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", 
"thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int64(is_size_var=True), T.int64(is_size_var=True) + A = T.match_buffer(var_A, (batch_size, vocab_size)) + temperature = T.match_buffer(var_temperature, (batch_size,)) + num_chunks = T.int64(is_size_var=True) + chunked_sum = T.match_buffer(var_chunked_sum, (batch_size, num_chunks)) + chunked_max = T.match_buffer(var_chunked_max, (batch_size, num_chunks)) + softmax = T.match_buffer(var_softmax, (batch_size, vocab_size)) + # with T.block("root"): + temp_max_shared = T.alloc_buffer((batch_size,), scope="shared") + temp_sum_shared = T.alloc_buffer((batch_size,), scope="shared") + for l0_l1_fused in T.thread_binding(batch_size * num_chunks, thread="blockIdx.x"): + for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): + with T.block("max"): + v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) + v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) + T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) + T.reads(chunked_max[v0, v1]) + T.writes(temp_max_shared[v0]) + with T.init(): + temp_max_shared[v0] = T.float32(-3.4028234663852886e+38) + temp_max_shared[v0] = T.max(temp_max_shared[v0], chunked_max[v0, v1]) + for ax0_1 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + for ax0_0 in T.serial((num_chunks + T.int64(31)) // T.int64(32), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): + with T.block("sum_exp"): + v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) + v1 = T.axis.reduce(num_chunks, ax0_0 * T.int64(32) + ax0_1) + T.where(ax0_0 * T.int64(32) + ax0_1 < num_chunks) + T.reads(temperature[v0], chunked_sum[v0, v1], chunked_max[v0, v1], temp_max_shared[v0]) + T.writes(temp_sum_shared[v0]) + with 
T.init(): + temp_sum_shared[v0] = T.float32(0) + temp_sum_shared[v0] = temp_sum_shared[v0] + T.Select(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(chunked_sum[v0, v1] + chunked_max[v0, v1] - temp_max_shared[v0]), T.Cast("float32", chunked_max[v0, v1] == temp_max_shared[v0]) * chunked_sum[v0, v1]) + for l2_0 in T.serial(T.int64(4), annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1}): + for l2_1 in T.thread_binding(T.int64(32), thread="threadIdx.y"): + for l2_2 in T.thread_binding(T.int64(32), thread="threadIdx.x"): + with T.block("log_pad"): + v0 = T.axis.spatial(batch_size, l0_l1_fused % (num_chunks * batch_size) // num_chunks) + v1 = T.axis.spatial(num_chunks, l0_l1_fused % num_chunks) + v2 = T.axis.spatial(T.int64(4096), l2_0 * T.int64(1024) + l2_1 * T.int64(32) + l2_2) + T.reads(temperature[v0], A[v0, v1 * T.int64(4096) + v2], temp_sum_shared[v0], temp_max_shared[v0]) + T.writes(softmax[v0, v1 * T.int64(4096) + v2]) + if v1 * T.int64(4096) + v2 < vocab_size: + softmax[v0, v1 * T.int64(4096) + v2] = T.if_then_else(temperature[v0] > T.float32(1.0000000000000001e-05), T.exp(A[v0, v1 * T.int64(4096) + v2] / temperature[v0] - (T.log(temp_sum_shared[v0]) + temp_max_shared[v0])), T.Cast("float32", A[v0, v1 * T.int64(4096) + v2] == temp_max_shared[v0]) / temp_sum_shared[v0]) + + @T.prim_func + def take(model_decoder_embed_tokens_weight3: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), var_reshape707: T.handle, var_T_take: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + reshape707 = T.match_buffer(var_reshape707, (batch_size,), "int32") + T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_take"): + 
v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(model_decoder_embed_tokens_weight3[reshape707[v0], v1], reshape707[v0]) + T.writes(T_take[v0, v1]) + T_take[v0, v1] = model_decoder_embed_tokens_weight3[reshape707[v0], v1] + + @T.prim_func + def take1(model_decoder_embed_positions_weight3: T.Buffer((T.int64(448), T.int64(1280)), "float16"), var_lv133: T.handle, var_T_take: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size = T.int64() + lv133 = T.match_buffer(var_lv133, (batch_size,), "int32") + T_take = T.match_buffer(var_T_take, (batch_size, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_take"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(model_decoder_embed_positions_weight3[lv133[v0], v1], lv133[v0]) + T.writes(T_take[v0, v1]) + T_take[v0, v1] = model_decoder_embed_positions_weight3[lv133[v0], v1] + + @T.prim_func + def take2(var_layer_norm161: T.handle, var_logit_positions: T.handle, var_T_take: T.handle): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + seq_len = T.int64() + layer_norm161 = T.match_buffer(var_layer_norm161, (T.int64(1), seq_len, T.int64(1280)), "float16") + batch_size = T.int64() + logit_positions = T.match_buffer(var_logit_positions, 
(batch_size,), "int32") + T_take = T.match_buffer(var_T_take, (T.int64(1), batch_size, T.int64(1280)), "float16") + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_take"): + v0 = T.axis.spatial(batch_size, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) // T.int64(1280)) + v1 = T.axis.spatial(T.int64(1280), (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % T.int64(1280)) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size * T.int64(1280)) + T.reads(layer_norm161[T.int64(0), logit_positions[v0], v1], logit_positions[v0]) + T.writes(T_take[T.int64(0), v0, v1]) + T_take[T.int64(0), v0, v1] = layer_norm161[T.int64(0), logit_positions[v0], v1] + + @T.prim_func + def take3(model_decoder_embed_tokens_weight5: T.Buffer((T.int64(51866), T.int64(1280)), "float16"), reshape1353: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_take"): + v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0], reshape1353[T.int64(0)]) + T.writes(T_take[T.int64(0), v0]) + T_take[T.int64(0), v0] = model_decoder_embed_tokens_weight5[reshape1353[T.int64(0)], v0] + + @T.prim_func + def take4(model_decoder_embed_positions_weight5: T.Buffer((T.int64(448), T.int64(1280)), "float16"), lv264: T.Buffer((T.int64(1),), "int32"), T_take: T.Buffer((T.int64(1), T.int64(1280)), "float16")): + T.func_attr({"tir.is_scheduled": 
1, "tir.noalias": T.bool(True)}) + # with T.block("root"): + for ax0_fused_0 in T.thread_binding(T.int64(2), thread="blockIdx.x"): + for ax0_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("T_take"): + v0 = T.axis.spatial(T.int64(1280), ax0_fused_0 * T.int64(1024) + ax0_fused_1) + T.where(ax0_fused_0 * T.int64(1024) + ax0_fused_1 < T.int64(1280)) + T.reads(model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0], lv264[T.int64(0)]) + T.writes(T_take[T.int64(0), v0]) + T_take[T.int64(0), v0] = model_decoder_embed_positions_weight5[lv264[T.int64(0)], v0] + + @T.prim_func + def take_sorted_probs(var_probs: T.handle, var_lv1: T.handle, var_take_sorted_probs: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + batch_size, vocab_size = T.int64(), T.int64() + probs = T.match_buffer(var_probs, (batch_size, vocab_size)) + lv1 = T.match_buffer(var_lv1, (batch_size, vocab_size), "int32") + batch_size_1, vocab_size_1 = T.int64(), T.int64() + take_sorted_probs = T.match_buffer(var_take_sorted_probs, (batch_size_1, vocab_size_1)) + # with T.block("root"): + for ax0_ax1_fused_0 in T.thread_binding((batch_size_1 * vocab_size_1 + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for ax0_ax1_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("take_sorted_probs"): + v0 = T.axis.spatial(batch_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % (vocab_size_1 * batch_size_1) // vocab_size_1) + v1 = T.axis.spatial(vocab_size_1, (ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1) % vocab_size_1) + T.where(ax0_ax1_fused_0 * T.int64(1024) + ax0_ax1_fused_1 < batch_size_1 * vocab_size_1) + T.reads(probs[v0, lv1[v0, v1]], lv1[v0, v1]) + 
T.writes(take_sorted_probs[v0, v1]) + take_sorted_probs[v0, v1] = probs[v0, lv1[v0, v1]] + + @T.prim_func + def tir_kv_cache_debug_get_kv(var_pages: T.handle, var_position_map: T.handle, var_k_data: T.handle, var_v_data: T.handle, layer_id: T.int64): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + num_pages, page_size = T.int64(), T.int64(is_size_var=True) + pages = T.match_buffer(var_pages, (num_pages, 2, 20, page_size, 64), "float16") + seqlen = T.int64(is_size_var=True) + position_map = T.match_buffer(var_position_map, (seqlen,), "int32", offset_factor=1) + k_data = T.match_buffer(var_k_data, (32, seqlen, 20, 64), "float16") + v_data = T.match_buffer(var_v_data, (32, seqlen, 20, 64), "float16") + # with T.block("root"): + for p_h_d_fused_0 in T.thread_binding((seqlen * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for p_h_d_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + with T.block("copy0"): + vp = T.axis.spatial(seqlen, (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) // T.int64(1280)) + vh = T.axis.spatial(20, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(1280) // T.int64(64))) + vd = T.axis.spatial(64, T.Cast("int32", (p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1) % T.int64(64))) + T.where(p_h_d_fused_0 * T.int64(1024) + p_h_d_fused_1 < seqlen * T.int64(1280)) + T.reads(position_map[vp], pages[T.Cast("int64", position_map[vp]) // page_size, 0:2, vh, T.Cast("int64", position_map[vp]) % page_size, vd]) + T.writes(k_data[layer_id, vp, vh, vd], v_data[layer_id, vp, vh, vd]) + position: T.int32 = position_map[vp] + k_data[layer_id, vp, 
vh, vd] = pages[T.Cast("int64", position) // page_size, 0, vh, T.Cast("int64", position) % page_size, vd] + v_data[layer_id, vp, vh, vd] = pages[T.Cast("int64", position) // page_size, 1, vh, T.Cast("int64", position) % page_size, vd] + + @T.prim_func + def tir_kv_cache_transpose_append(var_pages: T.handle, var_k_data: T.handle, var_v_data: T.handle, var_position_map: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mcpu": "znver3", "mtriple": "x86_64-pc-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + num_pages = T.int64() + pages = T.match_buffer(var_pages, (num_pages, 2, 20, 16, 64), "float16") + ntoken = T.int64(is_size_var=True) + k_data = T.match_buffer(var_k_data, (ntoken, 20, 64), "float16") + v_data = T.match_buffer(var_v_data, (ntoken, 20, 64), "float16") + position_map = T.match_buffer(var_position_map, (ntoken,), "int32", offset_factor=1) + # with T.block("root"): + for global_pos_h_f_fused_0 in T.thread_binding((ntoken * T.int64(1280) + T.int64(1023)) // T.int64(1024), thread="blockIdx.x"): + for global_pos_h_f_fused_1 in T.thread_binding(T.int64(1024), thread="threadIdx.x"): + if position_map[(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)] != -1: + with T.block("k_transpose_append"): + vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) + vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) + vf = T.axis.spatial(64, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) + T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * 
T.int64(1280)) + T.reads(position_map[vgpos], k_data[vgpos, vh, vf]) + T.writes(pages[position_map[vgpos] // 16, 0, vh, position_map[vgpos] % 16, vf]) + position: T.int32 = position_map[vgpos] + pages[position // 16, 0, vh, position % 16, vf] = k_data[vgpos, vh, vf] + with T.block("v_transpose_append"): + vgpos = T.axis.spatial(ntoken, (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) // T.int64(1280)) + vh = T.axis.spatial(20, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(1280) // T.int64(64))) + vf = T.axis.spatial(64, T.Cast("int32", (global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1) % T.int64(64))) + T.where(global_pos_h_f_fused_0 * T.int64(1024) + global_pos_h_f_fused_1 < ntoken * T.int64(1280)) + T.reads(position_map[vgpos], v_data[vgpos, vh, vf]) + T.writes(pages[position_map[vgpos] // 16, 1, vh, position_map[vgpos] % 16, vf]) + position: T.int32 = position_map[vgpos] + pages[position // 16, 1, vh, position % 16, vf] = v_data[vgpos, vh, vf] + + @T.prim_func + def top_p_pivot_cutoff(var_prob: T.handle, var_top_p_arr: T.handle, var_init_pivots: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + B, N = T.int32(), T.int32() + prob = T.match_buffer(var_prob, (B, N)) + top_p_arr = T.match_buffer(var_top_p_arr, (B,)) + init_pivots = T.match_buffer(var_init_pivots, (B, 3)) + final_pivot = T.match_buffer(var_final_pivot, (B,)) + final_lsum = T.match_buffer(var_final_lsum, (B,)) + # with T.block("root"): + pivot = T.alloc_buffer((3,), scope="local") + top_p = T.alloc_buffer((1,), scope="local") + L = T.alloc_buffer((1,), scope="shared") + R_1 = T.alloc_buffer((1,), scope="shared") + 
L_local = T.alloc_buffer((1,), scope="local") + R_local = T.alloc_buffer((1,), scope="local") + q = T.alloc_buffer((1,), scope="local") + lsum = T.alloc_buffer((3,), scope="local") + lmin_broadcast = T.alloc_buffer((1,), scope="shared") + lmin_broadcast_local = T.alloc_buffer((1,), scope="local") + lmin = T.alloc_buffer((3,), scope="local") + cmin = T.alloc_buffer((3,), "int32", scope="local") + total_sum = T.alloc_buffer((1,), scope="local") + it = T.alloc_buffer((1,), "int32", scope="local") + es_local = T.alloc_buffer((1,), "bool", scope="local") + es = T.alloc_buffer((1,), "bool", scope="shared") + find_pivot_local = T.alloc_buffer((1,), "bool", scope="local") + find_pivot = T.alloc_buffer((1,), "bool", scope="shared") + total_sum_reduce = T.alloc_buffer((1,), scope="local") + lsum_reduce = T.alloc_buffer((1,), scope="local") + lmin_reduce = T.alloc_buffer((1,), scope="local") + cmin_reduce = T.alloc_buffer((1,), "int32", scope="local") + for _bx in T.thread_binding(B, thread="blockIdx.x"): + for _tx in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("CTA"): + b, tx = T.axis.remap("SS", [_bx, _tx]) + T.reads(top_p_arr[b], top_p[0], L[0], R_1[0], init_pivots[b, 0:3], L_local[0], R_local[0], find_pivot_local[0], it[0], es_local[0], prob[b, it[0] * 1024 + tx], total_sum[0], q[0], pivot[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lsum[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], lmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], cmin[T.min(0, it[0]):T.min(0, it[0]) + (T.max(2, it[0]) + 1 - T.min(0, it[0]))], total_sum_reduce[0], es[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], lsum_reduce[0], cmin_reduce[0], find_pivot[0]) + T.writes(top_p[0], L[0], R_1[0], find_pivot[0], L_local[0], R_local[0], pivot[0:3], find_pivot_local[0], final_lsum[b], final_pivot[b], lsum[0:3], lmin[0:3], cmin[0:3], total_sum[0], it[0], es_local[0], q[0], 
total_sum_reduce[0], es[0], lsum_reduce[0], lmin_reduce[0], lmin_broadcast[0], lmin_broadcast_local[0], cmin_reduce[0]) + top_p[0] = top_p_arr[b] + if tx == 0: + L[0] = T.float32(1) - top_p[0] + R_1[0] = T.float32(9.9999999999999995e-08) + find_pivot[0] = T.bool(False) + T.tvm_storage_sync("shared") + L_local[0] = L[0] + R_local[0] = R_1[0] + for i in T.unroll(3): + pivot[i] = init_pivots[b, i] + find_pivot_local[0] = T.bool(False) + if L_local[0] - R_local[0] <= T.float32(9.9999999999999995e-08): + if tx == 0: + final_lsum[b] = T.float32(1) + final_pivot[b] = T.float32(0) + find_pivot_local[0] = T.bool(True) + while T.tvm_thread_invariant(L_local[0] - R_local[0] > T.float32(9.9999999999999995e-08) and not find_pivot_local[0]): + T.tvm_storage_sync("shared") + for pidx in T.unroll(3): + lsum[pidx] = T.float32(0) + lmin[pidx] = T.float32(3.4028234663852886e+38) + cmin[pidx] = 0 + total_sum[0] = T.float32(0) + it[0] = 0 + es_local[0] = T.bool(False) + while it[0] < (N + 1024 - 1) // 1024 and not es_local[0]: + q[0] = T.if_then_else(it[0] * 1024 + tx < N, prob[b, it[0] * 1024 + tx], T.float32(0)) + total_sum[0] = total_sum[0] + q[0] + for pidx in T.unroll(3): + if q[0] >= pivot[pidx]: + lsum[pidx] = lsum[pidx] + q[0] + if lmin[pidx] > q[0]: + lmin[pidx] = q[0] + cmin[pidx] = 1 + else: + if lmin[pidx] == q[0]: + cmin[pidx] = cmin[pidx] + 1 + it[0] = it[0] + 1 + if it[0] % 32 == 0: + with T.block("block_cross_thread"): + T.reads(total_sum[0]) + T.writes(total_sum_reduce[0]) + T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), total_sum[0], T.bool(True), total_sum_reduce[0], tx) + if tx == 0: + es[0] = T.float32(1) - total_sum_reduce[0] < pivot[2] + T.tvm_storage_sync("shared") + es_local[0] = es[0] + T.tvm_storage_sync("shared") + for pidx in range(3): + with T.block("block_cross_thread"): + T.reads(lsum[pidx]) + T.writes(lsum_reduce[0]) + T.attr(T.comm_reducer(lambda 
x0, y0: x0 + y0, [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), lsum[pidx], T.bool(True), lsum_reduce[0], tx) + with T.block("block_cross_thread"): + T.reads(lmin[pidx]) + T.writes(lmin_reduce[0]) + T.attr(T.comm_reducer(lambda x0, y0: T.min(x0, y0), [T.float32(0)]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), lmin[pidx], T.bool(True), lmin_reduce[0], tx) + if tx == 0: + lmin_broadcast[0] = lmin_reduce[0] + T.tvm_storage_sync("shared") + lmin_broadcast_local[0] = lmin_broadcast[0] + if lmin[pidx] > lmin_broadcast_local[0]: + cmin[pidx] = 0 + if tx == 0: + lsum[pidx] = lsum_reduce[0] + lmin[pidx] = lmin_reduce[0] + with T.block("block_cross_thread"): + T.reads(cmin[pidx]) + T.writes(cmin_reduce[0]) + T.attr(T.comm_reducer(lambda x0, y0: x0 + y0, [0]), "reduce_scope", T.reinterpret("handle", T.uint64(0))) + T.tvm_thread_allreduce(T.uint32(1), cmin[pidx], T.bool(True), cmin_reduce[0], tx) + if tx == 0: + cmin[pidx] = cmin_reduce[0] + T.tvm_storage_sync("shared") + if tx == 0: + it[0] = 0 + while it[0] < 3 and not find_pivot_local[0]: + if lsum[it[0]] >= top_p[0] and top_p[0] > lsum[it[0]] - T.Cast("float32", cmin[it[0]]) * lmin[it[0]]: + find_pivot[0] = T.bool(True) + find_pivot_local[0] = T.bool(True) + final_pivot[b] = pivot[it[0]] + final_lsum[b] = lsum[it[0]] + else: + if lsum[it[0]] - lmin[it[0]] * T.Cast("float32", cmin[it[0]]) >= top_p[0]: + R_1[0] = pivot[it[0]] + final_lsum[b] = lsum[it[0]] + else: + if lsum[it[0]] < top_p[0]: + L[0] = pivot[it[0]] + it[0] = it[0] + 1 + T.tvm_storage_sync("shared") + L_local[0] = L[0] + R_local[0] = R_1[0] + find_pivot_local[0] = find_pivot[0] + for pidx in T.unroll(3): + pivot[pidx] = L[0] - T.Cast("float32", pidx + 1) * (L_local[0] - R_local[0]) / T.float32(4) + if tx == 0: + if not find_pivot_local[0]: + final_pivot[b] = R_local[0] + if R_local[0] == T.float32(9.9999999999999995e-08): + final_lsum[b] = lsum[2] + + 
@T.prim_func + def top_p_renorm_after_cutoff(var_prob: T.handle, var_final_pivot: T.handle, var_final_lsum: T.handle, var_renorm_prob: T.handle): + T.func_attr({"target": T.target({"arch": "sm_89", "keys": ["cuda", "gpu"], "kind": "cuda", "libs": ["thrust"], "max_num_threads": 1024, "max_shared_memory_per_block": 49152, "max_threads_per_block": 1024, "tag": "", "thread_warp_size": 32}), "tir.is_scheduled": 1, "tir.noalias": T.bool(True)}) + B, N = T.int32(), T.int32() + prob = T.match_buffer(var_prob, (B, N)) + final_pivot = T.match_buffer(var_final_pivot, (B,)) + final_lsum = T.match_buffer(var_final_lsum, (B,)) + renorm_prob = T.match_buffer(var_renorm_prob, (B, N)) + # with T.block("root"): + pivot = T.alloc_buffer((1,), scope="local") + lsum = T.alloc_buffer((1,), scope="local") + for _by in T.thread_binding(B, thread="blockIdx.y"): + for _bx in T.thread_binding((B + 511) // B, thread="blockIdx.x"): + for _tx in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("CTA"): + by, bx, tx = T.axis.remap("SSS", [_by, _bx, _tx]) + T.reads(final_pivot[by], final_lsum[by], prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * ((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)], pivot[0], lsum[0]) + T.writes(pivot[0], lsum[0], renorm_prob[by, T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx:T.Select(0 <= (B + 511) // B, 0, (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + bx * 1024 + tx + (T.Select(0 <= (B + 511) // B, (N - 1) // ((B + 511) // B * 1024) * 
((B + 511) // B), 0 - (((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024) - 1) * ((B + 511) // B)) * 1024 + 1)]) + pivot[0] = final_pivot[by] + lsum[0] = final_lsum[by] + for i in range(((B + 511) // B * 1024 + N - 1) // ((B + 511) // B * 1024)): + if i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx < N: + renorm_prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] = T.if_then_else(prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] >= pivot[0], prob[by, i * ((512 + B - 1) // B) * 1024 + bx * 1024 + tx] / lsum[0], T.float32(0)) + + @R.function + def _metadata() -> R.Object: + shape_heap: R.Object = R.null_value() + return R.str("{\"model_type\": \"whisper\", \"quantization\": \"q0f16\", \"context_window_size\": 1500, \"sliding_window_size\": -1, \"attention_sink_size\": -1, \"prefill_chunk_size\": 15000, \"tensor_parallel_shards\": 1, \"kv_state_kind\": \"kv_cache\", \"max_batch_size\": 8, \"params\": [{\"name\": \"model.encoder.conv1.weight\", \"shape\": [1280, 128, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv1.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.weight\", \"shape\": [1280, 1280, 3], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.conv2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.embed_positions.weight\", \"shape\": [1500, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.encoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.encoder.layers.13.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": 
[]}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.encoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_tokens.weight\", \"shape\": [51866, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.embed_positions.weight\", \"shape\": [448, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.0.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn.out_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.0.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.1.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.1.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.weight\", \"shape\": 
[1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.2.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn.out_proj.bias\", \"shape\": 
[1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.3.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.3.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.q_proj.bias\", \"shape\": 
[1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.4.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.weight\", \"shape\": 
[1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.5.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.5.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.6.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.7.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.7.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.8.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.9.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.9.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.10.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.v_proj.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.11.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.12.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.weight\", 
\"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.13.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.self_attn_layer_norm.bias\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.14.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.15.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.15.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.15.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.16.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.weight\", \"shape\": [1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.16.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.17.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.17.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.fc2.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.18.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.19.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.20.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.20.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.21.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.weight\", \"shape\": 
[1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.22.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.23.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.v_proj.bias\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.24.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.25.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.26.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.26.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.26.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.27.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.27.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": 
\"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, 
{\"name\": \"model.decoder.layers.28.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.28.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.weight\", 
\"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.29.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", 
\"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.30.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.weight\", \"shape\": [1280, 1280], 
\"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.self_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.k_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.v_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.q_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.weight\", \"shape\": [1280, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn.out_proj.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.encoder_attn_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.weight\", \"shape\": [5120, 1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc1.bias\", \"shape\": [5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.fc2.weight\", \"shape\": [1280, 5120], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": 
\"model.decoder.layers.31.fc2.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layers.31.final_layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.weight\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}, {\"name\": \"model.decoder.layer_norm.bias\", \"shape\": [1280], \"dtype\": \"float16\", \"preprocs\": []}], \"kv_cache\": {\"num_hidden_layers\": 32, \"num_attention_heads\": 20, \"num_key_value_heads\": 20, \"head_dim\": 64}, \"memory_usage\": {\"argsort_probs\": 0, \"batch_compute_cross_attn_kv\": 61440000, \"batch_decode\": 1987392, \"batch_encode\": 276480000, \"batch_prefill\": 616080192, \"create_tir_paged_kv_cache\": 0, \"decode\": 243304, \"multinomial_from_uniform\": 32, \"prefill\": 614610024, \"renormalize_by_top_p\": 64, \"sample_with_top_p\": 64, \"sampler_take_probs\": 416, \"sampler_verify_draft_tokens\": 0, \"softmax_with_temperature\": 0}}") + + @R.function + def argsort_probs(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32")) -> R.Tuple(R.Tensor(("batch_size", "vocab_size"), dtype="float32"), R.Tensor(("batch_size", "vocab_size"), dtype="int32")): + batch_size = T.int64() + vocab_size = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + 
R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + cls.shape_func2(shape_heap) + gv2560: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + storage30: R.Object = R.vm.alloc_storage(gv2560, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2561: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) + lv: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage30, R.prim_value(0), gv2561, R.dtype("uint8")) + R.vm.kill_object(storage30) + gv2562: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) + storage31: R.Object = R.vm.alloc_storage(gv2562, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2563: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1976: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage31, R.prim_value(0), gv2563, R.dtype("int32")) + R.vm.kill_object(storage31) + cls.argsort_thrust(probs, lv, alloc1976) + R.vm.kill_object(lv) + gv2564: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) + storage32: R.Object = R.vm.alloc_storage(gv2564, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2565: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1977: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage32, R.prim_value(0), gv2565, R.dtype("float32")) + R.vm.kill_object(storage32) + cls.take_sorted_probs(probs, alloc1976, alloc1977) + gv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="int32", ndim=2)) = alloc1977, alloc1976 + R.vm.kill_object(alloc1976) + R.vm.kill_object(alloc1977) + gv2566: R.Tensor(dtype="float32", ndim=2) = gv1[0] + R.call_packed("vm.builtin.match_shape", gv2566, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) + gv2567: R.Tensor(dtype="int32", ndim=2) = gv1[1] + R.call_packed("vm.builtin.match_shape", gv2567, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=argsort_probs, loc=return, annotation=R.Tuple(R.Tensor((batch_size, vocab_size), dtype=\"float32\"), R.Tensor((batch_size, vocab_size), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) + return gv1 + + @R.function + def batch_compute_cross_attn_kv(encoder_hidden_states: R.Tensor(("batch_size", 1500, 1280), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Object: + batch_size = T.int64() + R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", encoder_hidden_states, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, 
loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", encoder_hidden_states, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), R.str("ErrorContext(fn=batch_compute_cross_attn_kv, loc=param[0], param=encoder_hidden_states, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) + cls.shape_func(shape_heap) + model_decoder_layers_0_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[498] + storage11: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc554: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv883, R.dtype("float16")) + _552: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_0_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc554) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_k_proj_weight1) + gv884: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape256: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc554, gv884, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc554) + model_decoder_layers_0_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[499] + model_decoder_layers_0_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[500] + storage12: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc555: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv885, R.dtype("float16")) + _553: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_0_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_0_encoder_attn_v_proj_bias1, alloc555) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_v_proj_bias1) + gv886: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape257: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc555, gv886, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc555) + gv887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape258: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape256, gv887, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape256) + gv888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape259: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape257, gv888, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape257) + lv36: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", paged_kv_cache, R.prim_value(0), reshape258, reshape259, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape258) + R.vm.kill_object(reshape259) + model_decoder_layers_1_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[522] + gv889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc556: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv889, R.dtype("float16")) + _554: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", 
model_decoder_layers_1_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc556) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_k_proj_weight1) + gv890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape260: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc556, gv890, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc556) + model_decoder_layers_1_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[523] + model_decoder_layers_1_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[524] + gv891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc557: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv891, R.dtype("float16")) + _555: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_1_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_1_encoder_attn_v_proj_bias1, alloc557) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_v_proj_bias1) + gv892: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape261: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc557, gv892, 
sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc557) + gv893: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape262: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape260, gv893, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape260) + gv894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape263: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape261, gv894, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape261) + lv37: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv36, R.prim_value(1), reshape262, reshape263, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape262) + R.vm.kill_object(reshape263) + R.vm.kill_object(lv36) + model_decoder_layers_2_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[546] + gv895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc558: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv895, R.dtype("float16")) + _556: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_2_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc558) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_k_proj_weight1) + gv896: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape264: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc558, gv896, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc558) + model_decoder_layers_2_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[547] + model_decoder_layers_2_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[548] + gv897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc559: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv897, R.dtype("float16")) + _557: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_2_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_2_encoder_attn_v_proj_bias1, alloc559) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_v_proj_bias1) + gv898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape265: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc559, gv898, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc559) + gv899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape266: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape264, gv899, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape264) + gv900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape267: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape265, gv900, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape265) + lv38: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv37, R.prim_value(2), reshape266, reshape267, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape266) + R.vm.kill_object(reshape267) + R.vm.kill_object(lv37) + model_decoder_layers_3_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[570] + gv901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc560: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv901, R.dtype("float16")) + _558: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_3_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc560) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_k_proj_weight1) + gv902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape268: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc560, gv902, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc560) + model_decoder_layers_3_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[571] + model_decoder_layers_3_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[572] + gv903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc561: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv903, R.dtype("float16")) + _559: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_3_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_3_encoder_attn_v_proj_bias1, alloc561) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_v_proj_bias1) + gv904: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape269: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc561, gv904, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc561) + gv905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape270: R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape268, gv905, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape268) + gv906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape271: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape269, gv906, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape269) + lv39: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv38, R.prim_value(3), reshape270, reshape271, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape270) + R.vm.kill_object(reshape271) + R.vm.kill_object(lv38) + model_decoder_layers_4_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[594] + gv907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc562: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv907, R.dtype("float16")) + _560: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_4_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc562) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_k_proj_weight1) + gv908: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape272: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc562, gv908, 
sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc562) + model_decoder_layers_4_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[595] + model_decoder_layers_4_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[596] + gv909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc563: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv909, R.dtype("float16")) + _561: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_4_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_4_encoder_attn_v_proj_bias1, alloc563) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_v_proj_bias1) + gv910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape273: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc563, gv910, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc563) + gv911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape274: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape272, gv911, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape272) + gv912: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape275: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape273, gv912, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape273) + lv40: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv39, R.prim_value(4), reshape274, reshape275, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape274) + R.vm.kill_object(reshape275) + R.vm.kill_object(lv39) + model_decoder_layers_5_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[618] + gv913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc564: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv913, R.dtype("float16")) + _562: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_5_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc564) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_k_proj_weight1) + gv914: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape276: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc564, gv914, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc564) + model_decoder_layers_5_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[619] + model_decoder_layers_5_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[620] + gv915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc565: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv915, R.dtype("float16")) + _563: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_5_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_5_encoder_attn_v_proj_bias1, alloc565) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_v_proj_bias1) + gv916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape277: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc565, gv916, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc565) + gv917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape278: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape276, gv917, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape276) + gv918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape279: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape277, gv918, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape277) + lv41: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv40, R.prim_value(5), reshape278, reshape279, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape278) + R.vm.kill_object(reshape279) + R.vm.kill_object(lv40) + model_decoder_layers_6_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[642] + gv919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc566: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv919, R.dtype("float16")) + _564: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_6_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc566) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_k_proj_weight1) + gv920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape280: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc566, gv920, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc566) + model_decoder_layers_6_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[643] + model_decoder_layers_6_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[644] + gv921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc567: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv921, R.dtype("float16")) + _565: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_6_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_6_encoder_attn_v_proj_bias1, alloc567) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_v_proj_bias1) + gv922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape281: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc567, gv922, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc567) + gv923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape282: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape280, gv923, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape280) + gv924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape283: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape281, gv924, sinfo_args=(R.Tensor((batch_size * 1500, 
20, 64), dtype="float16"),)) + R.vm.kill_object(reshape281) + lv42: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv41, R.prim_value(6), reshape282, reshape283, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape282) + R.vm.kill_object(reshape283) + R.vm.kill_object(lv41) + model_decoder_layers_7_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[666] + gv925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc568: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv925, R.dtype("float16")) + _566: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_7_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc568) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_k_proj_weight1) + gv926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape284: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc568, gv926, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc568) + model_decoder_layers_7_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[667] + model_decoder_layers_7_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[668] + gv927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc569: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv927, R.dtype("float16")) + _567: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_7_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_7_encoder_attn_v_proj_bias1, alloc569) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_v_proj_bias1) + gv928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape285: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc569, gv928, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc569) + gv929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape286: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape284, gv929, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape284) + gv930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape287: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape285, gv930, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape285) + lv43: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv42, R.prim_value(7), reshape286, reshape287, 
sinfo_args=(R.Object,)) + R.vm.kill_object(reshape286) + R.vm.kill_object(reshape287) + R.vm.kill_object(lv42) + model_decoder_layers_8_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[690] + gv931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc570: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv931, R.dtype("float16")) + _568: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_8_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc570) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_k_proj_weight1) + gv932: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape288: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc570, gv932, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc570) + model_decoder_layers_8_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[691] + model_decoder_layers_8_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[692] + gv933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc571: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv933, R.dtype("float16")) + _569: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_8_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_8_encoder_attn_v_proj_bias1, alloc571) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_v_proj_bias1) + gv934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape289: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc571, gv934, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc571) + gv935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape290: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape288, gv935, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape288) + gv936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape291: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape289, gv936, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape289) + lv44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv43, R.prim_value(8), reshape290, reshape291, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape290) + R.vm.kill_object(reshape291) + R.vm.kill_object(lv43) + model_decoder_layers_9_encoder_attn_k_proj_weight1: R.Tensor((1280, 
1280), dtype="float16") = packed_params[714] + gv937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc572: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv937, R.dtype("float16")) + _570: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_9_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc572) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_k_proj_weight1) + gv938: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape292: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc572, gv938, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc572) + model_decoder_layers_9_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[715] + model_decoder_layers_9_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[716] + gv939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc573: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv939, R.dtype("float16")) + _571: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_9_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_9_encoder_attn_v_proj_bias1, alloc573) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_weight1) 
+ R.vm.kill_object(model_decoder_layers_9_encoder_attn_v_proj_bias1) + gv940: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape293: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc573, gv940, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc573) + gv941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape294: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape292, gv941, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape292) + gv942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape295: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape293, gv942, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape293) + lv45: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv44, R.prim_value(9), reshape294, reshape295, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape294) + R.vm.kill_object(reshape295) + R.vm.kill_object(lv44) + model_decoder_layers_10_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[738] + gv943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc574: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv943, R.dtype("float16")) + _572: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_10_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc574) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_k_proj_weight1) + gv944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape296: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc574, gv944, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc574) + model_decoder_layers_10_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[739] + model_decoder_layers_10_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[740] + gv945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc575: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv945, R.dtype("float16")) + _573: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_10_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_10_encoder_attn_v_proj_bias1, alloc575) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_v_proj_bias1) + gv946: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape297: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc575, gv946, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc575) + gv947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape298: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape296, gv947, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape296) + gv948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape299: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape297, gv948, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape297) + lv46: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv45, R.prim_value(10), reshape298, reshape299, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape298) + R.vm.kill_object(reshape299) + R.vm.kill_object(lv45) + model_decoder_layers_11_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[762] + gv949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc576: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv949, 
R.dtype("float16")) + _574: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_11_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc576) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_k_proj_weight1) + gv950: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape300: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc576, gv950, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc576) + model_decoder_layers_11_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[763] + model_decoder_layers_11_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[764] + gv951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc577: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv951, R.dtype("float16")) + _575: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_11_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_11_encoder_attn_v_proj_bias1, alloc577) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_v_proj_bias1) + gv952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape301: R.Tensor((batch_size, 
1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc577, gv952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc577) + gv953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape302: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape300, gv953, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape300) + gv954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape303: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape301, gv954, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape301) + lv47: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv46, R.prim_value(11), reshape302, reshape303, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape302) + R.vm.kill_object(reshape303) + R.vm.kill_object(lv46) + model_decoder_layers_12_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[786] + gv955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc578: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv955, R.dtype("float16")) + _576: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_12_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc578) + 
R.vm.kill_object(model_decoder_layers_12_encoder_attn_k_proj_weight1) + gv956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape304: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc578, gv956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc578) + model_decoder_layers_12_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[787] + model_decoder_layers_12_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[788] + gv957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc579: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv957, R.dtype("float16")) + _577: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_12_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_12_encoder_attn_v_proj_bias1, alloc579) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_v_proj_bias1) + gv958: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape305: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc579, gv958, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc579) + 
gv959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape306: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape304, gv959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape304) + gv960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape307: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape305, gv960, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape305) + lv48: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv47, R.prim_value(12), reshape306, reshape307, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape306) + R.vm.kill_object(reshape307) + R.vm.kill_object(lv47) + model_decoder_layers_13_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[810] + gv961: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc580: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv961, R.dtype("float16")) + _578: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_13_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc580) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_k_proj_weight1) + gv962: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape308: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc580, gv962, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc580) + model_decoder_layers_13_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[811] + model_decoder_layers_13_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[812] + gv963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc581: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv963, R.dtype("float16")) + _579: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_13_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_13_encoder_attn_v_proj_bias1, alloc581) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_v_proj_bias1) + gv964: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape309: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc581, gv964, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc581) + gv965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape310: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape308, gv965, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape308) + gv966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape311: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape309, gv966, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape309) + lv49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv48, R.prim_value(13), reshape310, reshape311, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape310) + R.vm.kill_object(reshape311) + R.vm.kill_object(lv48) + model_decoder_layers_14_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[834] + gv967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc582: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv967, R.dtype("float16")) + _580: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_14_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc582) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_k_proj_weight1) + gv968: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape312: R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc582, gv968, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc582) + model_decoder_layers_14_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[835] + model_decoder_layers_14_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[836] + gv969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc583: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv969, R.dtype("float16")) + _581: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_14_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_14_encoder_attn_v_proj_bias1, alloc583) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_v_proj_bias1) + gv970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape313: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc583, gv970, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc583) + gv971: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape314: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape312, gv971, sinfo_args=(R.Tensor((batch_size * 
1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape312) + gv972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape315: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape313, gv972, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape313) + lv50: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv49, R.prim_value(14), reshape314, reshape315, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape314) + R.vm.kill_object(reshape315) + R.vm.kill_object(lv49) + model_decoder_layers_15_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[858] + gv973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc584: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv973, R.dtype("float16")) + _582: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_15_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc584) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_k_proj_weight1) + gv974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape316: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc584, gv974, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc584) + 
model_decoder_layers_15_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[859] + model_decoder_layers_15_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[860] + gv975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc585: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv975, R.dtype("float16")) + _583: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_15_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_15_encoder_attn_v_proj_bias1, alloc585) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_v_proj_bias1) + gv976: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape317: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc585, gv976, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc585) + gv977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape318: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape316, gv977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape316) + gv978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape319: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape317, gv978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape317) + lv51: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv50, R.prim_value(15), reshape318, reshape319, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape318) + R.vm.kill_object(reshape319) + R.vm.kill_object(lv50) + model_decoder_layers_16_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[882] + gv979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc586: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv979, R.dtype("float16")) + _584: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_16_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc586) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_k_proj_weight1) + gv980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape320: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc586, gv980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc586) + model_decoder_layers_16_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[883] + model_decoder_layers_16_encoder_attn_v_proj_bias1: R.Tensor((1280,), 
dtype="float16") = packed_params[884] + gv981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc587: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv981, R.dtype("float16")) + _585: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_16_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_16_encoder_attn_v_proj_bias1, alloc587) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_v_proj_bias1) + gv982: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape321: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc587, gv982, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc587) + gv983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape322: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape320, gv983, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape320) + gv984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape323: R.Tensor((batch_size * 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape321, gv984, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape321) + lv52: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv51, R.prim_value(16), reshape322, reshape323, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape322) + R.vm.kill_object(reshape323) + R.vm.kill_object(lv51) + model_decoder_layers_17_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[906] + gv985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc588: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv985, R.dtype("float16")) + _586: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_17_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc588) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_k_proj_weight1) + gv986: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape324: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc588, gv986, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc588) + model_decoder_layers_17_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[907] + model_decoder_layers_17_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[908] + gv987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc589: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv987, R.dtype("float16")) + _587: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_17_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_17_encoder_attn_v_proj_bias1, alloc589) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_v_proj_bias1) + gv988: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape325: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc589, gv988, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc589) + gv989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape326: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape324, gv989, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape324) + gv990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape327: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape325, gv990, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape325) + lv53: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv52, R.prim_value(17), reshape326, reshape327, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape326) + R.vm.kill_object(reshape327) + R.vm.kill_object(lv52) + model_decoder_layers_18_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[930] + gv991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc590: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv991, R.dtype("float16")) + _588: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_18_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc590) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_k_proj_weight1) + gv992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape328: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc590, gv992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc590) + model_decoder_layers_18_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[931] + model_decoder_layers_18_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[932] + gv993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc591: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv993, 
R.dtype("float16")) + _589: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_18_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_18_encoder_attn_v_proj_bias1, alloc591) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_v_proj_bias1) + gv994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape329: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc591, gv994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc591) + gv995: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape330: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape328, gv995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape328) + gv996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape331: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape329, gv996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape329) + lv54: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv53, R.prim_value(18), reshape330, reshape331, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape330) + 
R.vm.kill_object(reshape331) + R.vm.kill_object(lv53) + model_decoder_layers_19_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[954] + gv997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc592: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv997, R.dtype("float16")) + _590: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_19_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc592) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_k_proj_weight1) + gv998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape332: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc592, gv998, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc592) + model_decoder_layers_19_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[955] + model_decoder_layers_19_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[956] + gv999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc593: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv999, R.dtype("float16")) + _591: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_19_encoder_attn_v_proj_weight1, encoder_hidden_states, 
model_decoder_layers_19_encoder_attn_v_proj_bias1, alloc593) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_v_proj_bias1) + gv1000: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape333: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc593, gv1000, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc593) + gv1001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape334: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape332, gv1001, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape332) + gv1002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape335: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape333, gv1002, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape333) + lv55: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv54, R.prim_value(19), reshape334, reshape335, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape334) + R.vm.kill_object(reshape335) + R.vm.kill_object(lv54) + model_decoder_layers_20_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[978] + gv1003: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc594: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1003, R.dtype("float16")) + _592: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_20_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc594) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_k_proj_weight1) + gv1004: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape336: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc594, gv1004, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc594) + model_decoder_layers_20_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[979] + model_decoder_layers_20_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[980] + gv1005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc595: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1005, R.dtype("float16")) + _593: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_20_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_20_encoder_attn_v_proj_bias1, alloc595) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_weight1) + 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_v_proj_bias1) + gv1006: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape337: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc595, gv1006, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc595) + gv1007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape338: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape336, gv1007, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape336) + gv1008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape339: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape337, gv1008, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape337) + lv56: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv55, R.prim_value(20), reshape338, reshape339, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape338) + R.vm.kill_object(reshape339) + R.vm.kill_object(lv55) + model_decoder_layers_21_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1002] + gv1009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc596: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1009, R.dtype("float16")) + _594: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_21_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc596) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_k_proj_weight1) + gv1010: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape340: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc596, gv1010, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc596) + model_decoder_layers_21_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1003] + model_decoder_layers_21_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1004] + gv1011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc597: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1011, R.dtype("float16")) + _595: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_21_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_21_encoder_attn_v_proj_bias1, alloc597) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_v_proj_bias1) + gv1012: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape341: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc597, gv1012, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc597) + gv1013: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape342: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape340, gv1013, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape340) + gv1014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape343: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape341, gv1014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape341) + lv57: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv56, R.prim_value(21), reshape342, reshape343, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape342) + R.vm.kill_object(reshape343) + R.vm.kill_object(lv56) + model_decoder_layers_22_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1026] + gv1015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc598: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, 
R.prim_value(0), gv1015, R.dtype("float16")) + _596: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_22_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc598) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_k_proj_weight1) + gv1016: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape344: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc598, gv1016, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc598) + model_decoder_layers_22_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1027] + model_decoder_layers_22_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1028] + gv1017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc599: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1017, R.dtype("float16")) + _597: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_22_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_22_encoder_attn_v_proj_bias1, alloc599) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_v_proj_bias1) + gv1018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + 
reshape345: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc599, gv1018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc599) + gv1019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape346: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape344, gv1019, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape344) + gv1020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape347: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape345, gv1020, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape345) + lv58: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv57, R.prim_value(22), reshape346, reshape347, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape346) + R.vm.kill_object(reshape347) + R.vm.kill_object(lv57) + model_decoder_layers_23_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1050] + gv1021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc600: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1021, R.dtype("float16")) + _598: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_23_encoder_attn_k_proj_weight1, 
encoder_hidden_states, alloc600) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_k_proj_weight1) + gv1022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape348: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc600, gv1022, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc600) + model_decoder_layers_23_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1051] + model_decoder_layers_23_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1052] + gv1023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc601: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1023, R.dtype("float16")) + _599: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_23_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_23_encoder_attn_v_proj_bias1, alloc601) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_v_proj_bias1) + gv1024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape349: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc601, gv1024, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16"),)) + R.vm.kill_object(alloc601) + gv1025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape350: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape348, gv1025, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape348) + gv1026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape351: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape349, gv1026, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape349) + lv59: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv58, R.prim_value(23), reshape350, reshape351, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape350) + R.vm.kill_object(reshape351) + R.vm.kill_object(lv58) + model_decoder_layers_24_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1074] + gv1027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc602: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1027, R.dtype("float16")) + _600: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_24_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc602) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_k_proj_weight1) + gv1028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape352: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc602, gv1028, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc602) + model_decoder_layers_24_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1075] + model_decoder_layers_24_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1076] + gv1029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc603: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1029, R.dtype("float16")) + _601: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_24_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_24_encoder_attn_v_proj_bias1, alloc603) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_v_proj_bias1) + gv1030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape353: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc603, gv1030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc603) + gv1031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape354: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape352, gv1031, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape352) + gv1032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape355: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape353, gv1032, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape353) + lv60: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv59, R.prim_value(24), reshape354, reshape355, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape354) + R.vm.kill_object(reshape355) + R.vm.kill_object(lv59) + model_decoder_layers_25_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1098] + gv1033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc604: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1033, R.dtype("float16")) + _602: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_25_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc604) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_k_proj_weight1) + gv1034: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape356: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc604, gv1034, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc604) + model_decoder_layers_25_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1099] + model_decoder_layers_25_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1100] + gv1035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc605: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1035, R.dtype("float16")) + _603: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_25_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_25_encoder_attn_v_proj_bias1, alloc605) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_v_proj_bias1) + gv1036: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape357: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc605, gv1036, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc605) + gv1037: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape358: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape356, gv1037, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape356) + gv1038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape359: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape357, gv1038, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape357) + lv61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv60, R.prim_value(25), reshape358, reshape359, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape358) + R.vm.kill_object(reshape359) + R.vm.kill_object(lv60) + model_decoder_layers_26_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1122] + gv1039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc606: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1039, R.dtype("float16")) + _604: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_26_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc606) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_k_proj_weight1) + gv1040: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape360: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc606, gv1040, 
sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc606) + model_decoder_layers_26_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1123] + model_decoder_layers_26_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1124] + gv1041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc607: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1041, R.dtype("float16")) + _605: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_26_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_26_encoder_attn_v_proj_bias1, alloc607) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_v_proj_bias1) + gv1042: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape361: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc607, gv1042, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc607) + gv1043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape362: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape360, gv1043, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(reshape360) + gv1044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape363: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape361, gv1044, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape361) + lv62: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv61, R.prim_value(26), reshape362, reshape363, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape362) + R.vm.kill_object(reshape363) + R.vm.kill_object(lv61) + model_decoder_layers_27_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1146] + gv1045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc608: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1045, R.dtype("float16")) + _606: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_27_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc608) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_k_proj_weight1) + gv1046: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape364: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc608, gv1046, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc608) + model_decoder_layers_27_encoder_attn_v_proj_weight1: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1147] + model_decoder_layers_27_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1148] + gv1047: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc609: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1047, R.dtype("float16")) + _607: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_27_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_27_encoder_attn_v_proj_bias1, alloc609) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_v_proj_bias1) + gv1048: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape365: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc609, gv1048, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc609) + gv1049: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape366: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape364, gv1049, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape364) + gv1050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape367: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape365, gv1050, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape365) + lv63: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv62, R.prim_value(27), reshape366, reshape367, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape366) + R.vm.kill_object(reshape367) + R.vm.kill_object(lv62) + model_decoder_layers_28_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1170] + gv1051: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc610: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1051, R.dtype("float16")) + _608: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_28_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc610) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_k_proj_weight1) + gv1052: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape368: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc610, gv1052, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc610) + model_decoder_layers_28_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1171] + model_decoder_layers_28_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1172] + 
gv1053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc611: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1053, R.dtype("float16")) + _609: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_28_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_28_encoder_attn_v_proj_bias1, alloc611) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_v_proj_bias1) + gv1054: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape369: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc611, gv1054, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc611) + gv1055: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape370: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape368, gv1055, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape368) + gv1056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape371: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape369, gv1056, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape369) + lv64: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv63, R.prim_value(28), reshape370, reshape371, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape370) + R.vm.kill_object(reshape371) + R.vm.kill_object(lv63) + model_decoder_layers_29_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1194] + gv1057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc612: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1057, R.dtype("float16")) + _610: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_29_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc612) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_k_proj_weight1) + gv1058: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape372: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc612, gv1058, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc612) + model_decoder_layers_29_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1195] + model_decoder_layers_29_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1196] + gv1059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc613: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1059, R.dtype("float16")) + _611: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_29_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_29_encoder_attn_v_proj_bias1, alloc613) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_v_proj_bias1) + gv1060: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape373: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc613, gv1060, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc613) + gv1061: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape374: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape372, gv1061, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape372) + gv1062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape375: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape373, gv1062, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape373) + lv65: 
R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv64, R.prim_value(29), reshape374, reshape375, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape374) + R.vm.kill_object(reshape375) + R.vm.kill_object(lv64) + model_decoder_layers_30_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1218] + gv1063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc614: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1063, R.dtype("float16")) + _612: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_30_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc614) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_k_proj_weight1) + gv1064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape376: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc614, gv1064, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc614) + model_decoder_layers_30_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1219] + model_decoder_layers_30_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1220] + gv1065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc615: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), 
gv1065, R.dtype("float16")) + _613: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_decoder_layers_30_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_30_encoder_attn_v_proj_bias1, alloc615) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_v_proj_bias1) + gv1066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape377: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc615, gv1066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc615) + gv1067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape378: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape376, gv1067, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape376) + gv1068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape379: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape377, gv1068, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape377) + lv66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv65, R.prim_value(30), reshape378, reshape379, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape378) 
+ R.vm.kill_object(reshape379) + R.vm.kill_object(lv65) + model_decoder_layers_31_encoder_attn_k_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1242] + gv1069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc616: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage11, R.prim_value(0), gv1069, R.dtype("float16")) + R.vm.kill_object(storage11) + _614: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_decoder_layers_31_encoder_attn_k_proj_weight1, encoder_hidden_states, alloc616) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_k_proj_weight1) + gv1070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape380: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc616, gv1070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc616) + model_decoder_layers_31_encoder_attn_v_proj_weight1: R.Tensor((1280, 1280), dtype="float16") = packed_params[1243] + model_decoder_layers_31_encoder_attn_v_proj_bias1: R.Tensor((1280,), dtype="float16") = packed_params[1244] + gv1071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc617: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage12, R.prim_value(0), gv1071, R.dtype("float16")) + R.vm.kill_object(storage12) + _615: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_decoder_layers_31_encoder_attn_v_proj_weight1, encoder_hidden_states, model_decoder_layers_31_encoder_attn_v_proj_bias1, alloc617) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_weight1) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_v_proj_bias1) + gv1072: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape381: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc617, gv1072, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc617) + gv1073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape382: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape380, gv1073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape380) + gv1074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape383: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape381, gv1074, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape381) + gv1: R.Object = R.call_packed("vm.builtin.attention_kv_cache_push_cross_attention_kv", lv66, R.prim_value(31), reshape382, reshape383, sinfo_args=(R.Object,)) + R.vm.kill_object(reshape382) + R.vm.kill_object(reshape383) + R.vm.kill_object(lv66) + return gv1 + + @R.function + def batch_decode(input_ids: 
R.Tensor(("batch_size", 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1, 51866), dtype="float32"): + batch_size = T.int64() + R.func_attr({"num_input": 2, "relax.force_pure": 1, "relax.rewrite_cuda_graph.capture_symbolic_vars": ["batch_size"], "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=batch_decode, loc=param[0], param=input_ids, annotation=R.Tensor((batch_size, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + 
model_decoder_embed_tokens_weight3: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + gv1075: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) + reshape707: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv1075, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) + model_decoder_embed_tokens_weight3_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + storage13: R.Object = R.vm.alloc_storage(R.shape([81920]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1076: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) + alloc618: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1076, R.dtype("float16")) + cls.take(model_decoder_embed_tokens_weight3_1, reshape707, alloc618) + R.vm.kill_object(reshape707) + R.vm.kill_object(model_decoder_embed_tokens_weight3_1) + gv1077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape708: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc618, gv1077, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc618) + lv133: R.Tensor((batch_size,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((batch_size,), dtype="int32"),)) + model_decoder_embed_positions_weight3: R.Tensor((448, 1280), dtype="float16") = packed_params[488] + storage14: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1078: R.Shape(ndim=2) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) + alloc619: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1078, R.dtype("float16")) + cls.take1(model_decoder_embed_positions_weight3, lv133, alloc619) + R.vm.kill_object(lv133) + R.vm.kill_object(model_decoder_embed_positions_weight3) + gv1079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape709: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc619, gv1079, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc619) + storage15: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1080, R.dtype("float16")) + cls.add(reshape708, reshape709, alloc620) + R.vm.kill_object(reshape708) + R.vm.kill_object(reshape709) + model_decoder_layers_0_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[496] + model_decoder_layers_0_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[497] + gv1081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, 
R.prim_value(0), gv1081, R.dtype("float16")) + cls.layer_norm(alloc620, model_decoder_layers_0_self_attn_layer_norm_weight3, model_decoder_layers_0_self_attn_layer_norm_bias3, alloc621) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias3) + model_decoder_layers_0_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] + model_decoder_layers_0_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[493] + gv1082: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1082, R.dtype("float16")) + _620: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_q_proj_weight3, alloc621, model_decoder_layers_0_self_attn_q_proj_bias3, alloc622) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias3) + gv1083: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape710: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc622, gv1083, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc622) + model_decoder_layers_0_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] + storage16: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1084: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1084, R.dtype("float16")) + _621: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_0_self_attn_k_proj_weight3, alloc621, alloc623) + R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight3) + gv1085: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape711: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc623, gv1085, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc623) + model_decoder_layers_0_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] + model_decoder_layers_0_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[491] + storage17: R.Object = R.vm.alloc_storage(R.shape([61440]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1086, R.dtype("float16")) + _622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_v_proj_weight3, alloc621, model_decoder_layers_0_self_attn_v_proj_bias3, alloc624) + R.vm.kill_object(alloc621) + 
R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias3) + gv1087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape712: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc624, gv1087, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc624) + gv1088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc625: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1088, R.dtype("float16")) + cls.concatenate(reshape710, reshape711, reshape712, alloc625) + R.vm.kill_object(reshape710) + R.vm.kill_object(reshape711) + R.vm.kill_object(reshape712) + gv1089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape713: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc625, gv1089, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc625) + gv1090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1090, R.dtype("float16")) + _624: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape713, alloc626) + R.vm.kill_object(reshape713) + gv1091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape714: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc626, gv1091, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc626) + gv1092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape715: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape714, gv1092, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape714) + model_decoder_layers_0_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] + model_decoder_layers_0_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[495] + gv1093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1093, R.dtype("float16")) + _625: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_self_attn_out_proj_weight3, reshape715, model_decoder_layers_0_self_attn_out_proj_bias3, alloc627) + R.vm.kill_object(reshape715) + 
R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias3) + gv1094: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1094, R.dtype("float16")) + cls.add(alloc620, alloc627, alloc628) + R.vm.kill_object(alloc620) + R.vm.kill_object(alloc627) + model_decoder_layers_0_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[505] + model_decoder_layers_0_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[506] + gv1095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1095, R.dtype("float16")) + cls.layer_norm(alloc628, model_decoder_layers_0_encoder_attn_layer_norm_weight3, model_decoder_layers_0_encoder_attn_layer_norm_bias3, alloc629) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias3) + model_decoder_layers_0_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] + model_decoder_layers_0_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[502] + gv1096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, 
R.prim_value(0), gv1096, R.dtype("float16")) + _628: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight3, alloc629, model_decoder_layers_0_encoder_attn_q_proj_bias3, alloc630) + R.vm.kill_object(alloc629) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias3) + gv1097: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape716: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc630, gv1097, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc630) + gv1098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape717: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape716, gv1098, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape716) + gv1099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1099, R.dtype("float16")) + _629: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape717, alloc631) + R.vm.kill_object(reshape717) + gv1100: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape718: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc631, gv1100, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc631) + gv1101: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape719: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape718, gv1101, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape718) + model_decoder_layers_0_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] + model_decoder_layers_0_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[504] + gv1102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1102, R.dtype("float16")) + _630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight3, reshape719, model_decoder_layers_0_encoder_attn_out_proj_bias3, alloc632) + R.vm.kill_object(reshape719) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias3) + gv1103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1103, R.dtype("float16")) + cls.add(alloc628, alloc632, alloc633) + R.vm.kill_object(alloc628) + R.vm.kill_object(alloc632) + model_decoder_layers_0_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[511] + model_decoder_layers_0_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[512] + gv1104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1104, R.dtype("float16")) + cls.layer_norm(alloc633, model_decoder_layers_0_final_layer_norm_weight3, model_decoder_layers_0_final_layer_norm_bias3, alloc634) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias3) + model_decoder_layers_0_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] + model_decoder_layers_0_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[508] + gv1105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1105, R.dtype("float16")) + _633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_0_fc1_weight3, alloc634, model_decoder_layers_0_fc1_bias3, alloc635) + R.vm.kill_object(alloc634) + R.vm.kill_object(model_decoder_layers_0_fc1_weight3) + R.vm.kill_object(model_decoder_layers_0_fc1_bias3) + 
model_decoder_layers_0_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] + model_decoder_layers_0_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[510] + gv1106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1106, R.dtype("float16")) + _634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_0_fc2_weight3, alloc635, model_decoder_layers_0_fc2_bias3, alloc636) + R.vm.kill_object(alloc635) + R.vm.kill_object(model_decoder_layers_0_fc2_weight3) + R.vm.kill_object(model_decoder_layers_0_fc2_bias3) + gv1107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1107, R.dtype("float16")) + cls.add(alloc633, alloc636, alloc637) + R.vm.kill_object(alloc633) + R.vm.kill_object(alloc636) + model_decoder_layers_1_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[520] + model_decoder_layers_1_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[521] + gv1108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1108, R.dtype("float16")) + cls.layer_norm(alloc637, model_decoder_layers_1_self_attn_layer_norm_weight3, 
model_decoder_layers_1_self_attn_layer_norm_bias3, alloc638) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias3) + model_decoder_layers_1_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] + model_decoder_layers_1_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[517] + gv1109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1109, R.dtype("float16")) + _637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_q_proj_weight3, alloc638, model_decoder_layers_1_self_attn_q_proj_bias3, alloc639) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias3) + gv1110: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape720: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc639, gv1110, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc639) + model_decoder_layers_1_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] + gv1111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc640: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1111, R.dtype("float16")) + _638: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_1_self_attn_k_proj_weight3, alloc638, alloc640) + R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight3) + gv1112: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape721: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc640, gv1112, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc640) + model_decoder_layers_1_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] + model_decoder_layers_1_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[515] + gv1113: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1113, R.dtype("float16")) + _639: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_v_proj_weight3, alloc638, model_decoder_layers_1_self_attn_v_proj_bias3, alloc641) + R.vm.kill_object(alloc638) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias3) + gv1114: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape722: 
R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc641, gv1114, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc641) + gv1115: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc642: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1115, R.dtype("float16")) + cls.concatenate(reshape720, reshape721, reshape722, alloc642) + R.vm.kill_object(reshape720) + R.vm.kill_object(reshape721) + R.vm.kill_object(reshape722) + gv1116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape723: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc642, gv1116, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc642) + gv1117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1117, R.dtype("float16")) + _641: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape723, alloc643) + R.vm.kill_object(reshape723) + gv1118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape724: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc643, gv1118, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc643) + gv1119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape725: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape724, gv1119, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape724) + model_decoder_layers_1_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] + model_decoder_layers_1_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[519] + gv1120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1120, R.dtype("float16")) + _642: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_self_attn_out_proj_weight3, reshape725, model_decoder_layers_1_self_attn_out_proj_bias3, alloc644) + R.vm.kill_object(reshape725) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias3) + gv1121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1121, 
R.dtype("float16")) + cls.add(alloc637, alloc644, alloc645) + R.vm.kill_object(alloc637) + R.vm.kill_object(alloc644) + model_decoder_layers_1_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[529] + model_decoder_layers_1_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[530] + gv1122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1122, R.dtype("float16")) + cls.layer_norm(alloc645, model_decoder_layers_1_encoder_attn_layer_norm_weight3, model_decoder_layers_1_encoder_attn_layer_norm_bias3, alloc646) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias3) + model_decoder_layers_1_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] + model_decoder_layers_1_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[526] + gv1123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1123, R.dtype("float16")) + _645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight3, alloc646, model_decoder_layers_1_encoder_attn_q_proj_bias3, alloc647) + R.vm.kill_object(alloc646) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias3) + gv1124: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape726: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc647, gv1124, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc647) + gv1125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape727: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape726, gv1125, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape726) + gv1126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc648: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1126, R.dtype("float16")) + _646: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape727, alloc648) + R.vm.kill_object(reshape727) + gv1127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape728: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc648, gv1127, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc648) + gv1128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape729: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape728, gv1128, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape728) + model_decoder_layers_1_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] + model_decoder_layers_1_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[528] + gv1129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1129, R.dtype("float16")) + _647: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight3, reshape729, model_decoder_layers_1_encoder_attn_out_proj_bias3, alloc649) + R.vm.kill_object(reshape729) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias3) + gv1130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1130, R.dtype("float16")) + cls.add(alloc645, alloc649, alloc650) + R.vm.kill_object(alloc645) + R.vm.kill_object(alloc649) + model_decoder_layers_1_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[535] + model_decoder_layers_1_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") 
= packed_params[536] + gv1131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1131, R.dtype("float16")) + cls.layer_norm(alloc650, model_decoder_layers_1_final_layer_norm_weight3, model_decoder_layers_1_final_layer_norm_bias3, alloc651) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias3) + model_decoder_layers_1_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] + model_decoder_layers_1_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[532] + gv1132: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1132, R.dtype("float16")) + _650: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_1_fc1_weight3, alloc651, model_decoder_layers_1_fc1_bias3, alloc652) + R.vm.kill_object(alloc651) + R.vm.kill_object(model_decoder_layers_1_fc1_weight3) + R.vm.kill_object(model_decoder_layers_1_fc1_bias3) + model_decoder_layers_1_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] + model_decoder_layers_1_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[534] + gv1133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc653: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1133, R.dtype("float16")) + _651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_1_fc2_weight3, alloc652, model_decoder_layers_1_fc2_bias3, alloc653) + R.vm.kill_object(alloc652) + R.vm.kill_object(model_decoder_layers_1_fc2_weight3) + R.vm.kill_object(model_decoder_layers_1_fc2_bias3) + gv1134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1134, R.dtype("float16")) + cls.add(alloc650, alloc653, alloc654) + R.vm.kill_object(alloc650) + R.vm.kill_object(alloc653) + model_decoder_layers_2_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[544] + model_decoder_layers_2_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[545] + gv1135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1135, R.dtype("float16")) + cls.layer_norm(alloc654, model_decoder_layers_2_self_attn_layer_norm_weight3, model_decoder_layers_2_self_attn_layer_norm_bias3, alloc655) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias3) + model_decoder_layers_2_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] + model_decoder_layers_2_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[541] + gv1136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1136, R.dtype("float16")) + _654: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_q_proj_weight3, alloc655, model_decoder_layers_2_self_attn_q_proj_bias3, alloc656) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias3) + gv1137: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape730: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc656, gv1137, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc656) + model_decoder_layers_2_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] + gv1138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1138, R.dtype("float16")) + _655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_2_self_attn_k_proj_weight3, alloc655, alloc657) + R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight3) + gv1139: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape731: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc657, gv1139, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc657) + model_decoder_layers_2_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] + model_decoder_layers_2_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[539] + gv1140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc658: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1140, R.dtype("float16")) + _656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_v_proj_weight3, alloc655, model_decoder_layers_2_self_attn_v_proj_bias3, alloc658) + R.vm.kill_object(alloc655) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias3) + gv1141: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape732: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc658, gv1141, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc658) + gv1142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc659: R.Tensor(dtype="float16", ndim=4) 
= R.vm.alloc_tensor(storage14, R.prim_value(0), gv1142, R.dtype("float16")) + cls.concatenate(reshape730, reshape731, reshape732, alloc659) + R.vm.kill_object(reshape730) + R.vm.kill_object(reshape731) + R.vm.kill_object(reshape732) + gv1143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape733: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc659, gv1143, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc659) + gv1144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc660: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1144, R.dtype("float16")) + _658: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape733, alloc660) + R.vm.kill_object(reshape733) + gv1145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape734: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc660, gv1145, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc660) + gv1146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape735: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape734, gv1146, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape734) + model_decoder_layers_2_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] + model_decoder_layers_2_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[543] + gv1147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1147, R.dtype("float16")) + _659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_self_attn_out_proj_weight3, reshape735, model_decoder_layers_2_self_attn_out_proj_bias3, alloc661) + R.vm.kill_object(reshape735) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias3) + gv1148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1148, R.dtype("float16")) + cls.add(alloc654, alloc661, alloc662) + R.vm.kill_object(alloc654) + R.vm.kill_object(alloc661) + model_decoder_layers_2_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[553] + model_decoder_layers_2_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[554] + gv1149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1149, R.dtype("float16")) + cls.layer_norm(alloc662, model_decoder_layers_2_encoder_attn_layer_norm_weight3, model_decoder_layers_2_encoder_attn_layer_norm_bias3, alloc663) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias3) + model_decoder_layers_2_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] + model_decoder_layers_2_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[550] + gv1150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1150, R.dtype("float16")) + _662: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight3, alloc663, model_decoder_layers_2_encoder_attn_q_proj_bias3, alloc664) + R.vm.kill_object(alloc663) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias3) + gv1151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape736: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc664, gv1151, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc664) + gv1152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape737: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape736, gv1152, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape736) + gv1153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1153, R.dtype("float16")) + _663: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape737, alloc665) + R.vm.kill_object(reshape737) + gv1154: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape738: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc665, gv1154, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc665) + gv1155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape739: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape738, gv1155, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape738) + model_decoder_layers_2_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] + 
model_decoder_layers_2_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[552] + gv1156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1156, R.dtype("float16")) + _664: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight3, reshape739, model_decoder_layers_2_encoder_attn_out_proj_bias3, alloc666) + R.vm.kill_object(reshape739) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias3) + gv1157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1157, R.dtype("float16")) + cls.add(alloc662, alloc666, alloc667) + R.vm.kill_object(alloc662) + R.vm.kill_object(alloc666) + model_decoder_layers_2_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[559] + model_decoder_layers_2_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[560] + gv1158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1158, R.dtype("float16")) + cls.layer_norm(alloc667, model_decoder_layers_2_final_layer_norm_weight3, model_decoder_layers_2_final_layer_norm_bias3, 
alloc668) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias3) + model_decoder_layers_2_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] + model_decoder_layers_2_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[556] + gv1159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1159, R.dtype("float16")) + _667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_2_fc1_weight3, alloc668, model_decoder_layers_2_fc1_bias3, alloc669) + R.vm.kill_object(alloc668) + R.vm.kill_object(model_decoder_layers_2_fc1_weight3) + R.vm.kill_object(model_decoder_layers_2_fc1_bias3) + model_decoder_layers_2_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] + model_decoder_layers_2_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[558] + gv1160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc670: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1160, R.dtype("float16")) + _668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_2_fc2_weight3, alloc669, model_decoder_layers_2_fc2_bias3, alloc670) + R.vm.kill_object(alloc669) + R.vm.kill_object(model_decoder_layers_2_fc2_weight3) + R.vm.kill_object(model_decoder_layers_2_fc2_bias3) + gv1161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1161, R.dtype("float16")) + cls.add(alloc667, alloc670, alloc671) + R.vm.kill_object(alloc667) + R.vm.kill_object(alloc670) + model_decoder_layers_3_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[568] + model_decoder_layers_3_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[569] + gv1162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1162, R.dtype("float16")) + cls.layer_norm(alloc671, model_decoder_layers_3_self_attn_layer_norm_weight3, model_decoder_layers_3_self_attn_layer_norm_bias3, alloc672) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias3) + model_decoder_layers_3_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] + model_decoder_layers_3_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[565] + gv1163: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1163, R.dtype("float16")) + _671: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_q_proj_weight3, alloc672, model_decoder_layers_3_self_attn_q_proj_bias3, alloc673) + 
R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias3) + gv1164: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape740: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc673, gv1164, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc673) + model_decoder_layers_3_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] + gv1165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1165, R.dtype("float16")) + _672: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_3_self_attn_k_proj_weight3, alloc672, alloc674) + R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight3) + gv1166: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape741: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc674, gv1166, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc674) + model_decoder_layers_3_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] + model_decoder_layers_3_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[563] + 
gv1167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1167, R.dtype("float16")) + _673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_v_proj_weight3, alloc672, model_decoder_layers_3_self_attn_v_proj_bias3, alloc675) + R.vm.kill_object(alloc672) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias3) + gv1168: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape742: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc675, gv1168, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc675) + gv1169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc676: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1169, R.dtype("float16")) + cls.concatenate(reshape740, reshape741, reshape742, alloc676) + R.vm.kill_object(reshape740) + R.vm.kill_object(reshape741) + R.vm.kill_object(reshape742) + gv1170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape743: 
R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc676, gv1170, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc676) + gv1171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1171, R.dtype("float16")) + _675: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape743, alloc677) + R.vm.kill_object(reshape743) + gv1172: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape744: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc677, gv1172, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc677) + gv1173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape745: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape744, gv1173, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape744) + model_decoder_layers_3_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] + model_decoder_layers_3_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[567] + gv1174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1174, R.dtype("float16")) + _676: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_self_attn_out_proj_weight3, reshape745, model_decoder_layers_3_self_attn_out_proj_bias3, alloc678) + R.vm.kill_object(reshape745) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias3) + gv1175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1175, R.dtype("float16")) + cls.add(alloc671, alloc678, alloc679) + R.vm.kill_object(alloc671) + R.vm.kill_object(alloc678) + model_decoder_layers_3_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[577] + model_decoder_layers_3_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[578] + gv1176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1176, R.dtype("float16")) + cls.layer_norm(alloc679, model_decoder_layers_3_encoder_attn_layer_norm_weight3, model_decoder_layers_3_encoder_attn_layer_norm_bias3, alloc680) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias3) + 
model_decoder_layers_3_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] + model_decoder_layers_3_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[574] + gv1177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1177, R.dtype("float16")) + _679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight3, alloc680, model_decoder_layers_3_encoder_attn_q_proj_bias3, alloc681) + R.vm.kill_object(alloc680) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias3) + gv1178: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape746: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc681, gv1178, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc681) + gv1179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape747: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape746, gv1179, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape746) + gv1180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1180, R.dtype("float16")) + _680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape747, alloc682) + R.vm.kill_object(reshape747) + gv1181: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape748: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc682, gv1181, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc682) + gv1182: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape749: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape748, gv1182, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape748) + model_decoder_layers_3_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] + model_decoder_layers_3_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[576] + gv1183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1183, R.dtype("float16")) + _681: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight3, reshape749, model_decoder_layers_3_encoder_attn_out_proj_bias3, alloc683) + R.vm.kill_object(reshape749) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias3) + gv1184: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1184, R.dtype("float16")) + cls.add(alloc679, alloc683, alloc684) + R.vm.kill_object(alloc679) + R.vm.kill_object(alloc683) + model_decoder_layers_3_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[583] + model_decoder_layers_3_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[584] + gv1185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc685: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1185, R.dtype("float16")) + cls.layer_norm(alloc684, model_decoder_layers_3_final_layer_norm_weight3, model_decoder_layers_3_final_layer_norm_bias3, alloc685) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias3) + model_decoder_layers_3_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] + model_decoder_layers_3_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[580] + gv1186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1186, R.dtype("float16")) + _684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_3_fc1_weight3, alloc685, model_decoder_layers_3_fc1_bias3, alloc686) + R.vm.kill_object(alloc685) + R.vm.kill_object(model_decoder_layers_3_fc1_weight3) + R.vm.kill_object(model_decoder_layers_3_fc1_bias3) + model_decoder_layers_3_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] + model_decoder_layers_3_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[582] + gv1187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1187, R.dtype("float16")) + _685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_3_fc2_weight3, alloc686, model_decoder_layers_3_fc2_bias3, alloc687) + R.vm.kill_object(alloc686) + R.vm.kill_object(model_decoder_layers_3_fc2_weight3) + R.vm.kill_object(model_decoder_layers_3_fc2_bias3) + gv1188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1188, R.dtype("float16")) + cls.add(alloc684, alloc687, alloc688) + R.vm.kill_object(alloc684) + R.vm.kill_object(alloc687) + model_decoder_layers_4_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[592] + 
model_decoder_layers_4_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[593] + gv1189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1189, R.dtype("float16")) + cls.layer_norm(alloc688, model_decoder_layers_4_self_attn_layer_norm_weight3, model_decoder_layers_4_self_attn_layer_norm_bias3, alloc689) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias3) + model_decoder_layers_4_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] + model_decoder_layers_4_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[589] + gv1190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1190, R.dtype("float16")) + _688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_q_proj_weight3, alloc689, model_decoder_layers_4_self_attn_q_proj_bias3, alloc690) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias3) + gv1191: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape750: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc690, gv1191, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc690) + model_decoder_layers_4_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] + gv1192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1192, R.dtype("float16")) + _689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_4_self_attn_k_proj_weight3, alloc689, alloc691) + R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight3) + gv1193: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape751: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc691, gv1193, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc691) + model_decoder_layers_4_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] + model_decoder_layers_4_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[587] + gv1194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1194, R.dtype("float16")) + _690: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_4_self_attn_v_proj_weight3, alloc689, model_decoder_layers_4_self_attn_v_proj_bias3, alloc692) + R.vm.kill_object(alloc689) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias3) + gv1195: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape752: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc692, gv1195, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc692) + gv1196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc693: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1196, R.dtype("float16")) + cls.concatenate(reshape750, reshape751, reshape752, alloc693) + R.vm.kill_object(reshape750) + R.vm.kill_object(reshape751) + R.vm.kill_object(reshape752) + gv1197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape753: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc693, gv1197, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc693) + gv1198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc694: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1198, R.dtype("float16")) + _692: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape753, alloc694) + R.vm.kill_object(reshape753) + gv1199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape754: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc694, gv1199, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc694) + gv1200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape755: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape754, gv1200, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape754) + model_decoder_layers_4_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] + model_decoder_layers_4_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[591] + gv1201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1201, R.dtype("float16")) + _693: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_self_attn_out_proj_weight3, reshape755, 
model_decoder_layers_4_self_attn_out_proj_bias3, alloc695) + R.vm.kill_object(reshape755) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias3) + gv1202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1202, R.dtype("float16")) + cls.add(alloc688, alloc695, alloc696) + R.vm.kill_object(alloc688) + R.vm.kill_object(alloc695) + model_decoder_layers_4_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[601] + model_decoder_layers_4_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[602] + gv1203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1203, R.dtype("float16")) + cls.layer_norm(alloc696, model_decoder_layers_4_encoder_attn_layer_norm_weight3, model_decoder_layers_4_encoder_attn_layer_norm_bias3, alloc697) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias3) + model_decoder_layers_4_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] + model_decoder_layers_4_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[598] + gv1204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc698: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1204, R.dtype("float16")) + _696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight3, alloc697, model_decoder_layers_4_encoder_attn_q_proj_bias3, alloc698) + R.vm.kill_object(alloc697) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias3) + gv1205: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape756: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc698, gv1205, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc698) + gv1206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape757: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape756, gv1206, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape756) + gv1207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc699: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1207, R.dtype("float16")) + _697: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape757, alloc699) + 
R.vm.kill_object(reshape757) + gv1208: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape758: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc699, gv1208, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc699) + gv1209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape759: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape758, gv1209, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape758) + model_decoder_layers_4_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] + model_decoder_layers_4_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[600] + gv1210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1210, R.dtype("float16")) + _698: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight3, reshape759, model_decoder_layers_4_encoder_attn_out_proj_bias3, alloc700) + R.vm.kill_object(reshape759) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias3) + gv1211: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1211, R.dtype("float16")) + cls.add(alloc696, alloc700, alloc701) + R.vm.kill_object(alloc696) + R.vm.kill_object(alloc700) + model_decoder_layers_4_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[607] + model_decoder_layers_4_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[608] + gv1212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1212, R.dtype("float16")) + cls.layer_norm(alloc701, model_decoder_layers_4_final_layer_norm_weight3, model_decoder_layers_4_final_layer_norm_bias3, alloc702) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias3) + model_decoder_layers_4_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] + model_decoder_layers_4_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[604] + gv1213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1213, R.dtype("float16")) + _701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_4_fc1_weight3, alloc702, model_decoder_layers_4_fc1_bias3, alloc703) + 
R.vm.kill_object(alloc702) + R.vm.kill_object(model_decoder_layers_4_fc1_weight3) + R.vm.kill_object(model_decoder_layers_4_fc1_bias3) + model_decoder_layers_4_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] + model_decoder_layers_4_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[606] + gv1214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1214, R.dtype("float16")) + _702: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_4_fc2_weight3, alloc703, model_decoder_layers_4_fc2_bias3, alloc704) + R.vm.kill_object(alloc703) + R.vm.kill_object(model_decoder_layers_4_fc2_weight3) + R.vm.kill_object(model_decoder_layers_4_fc2_bias3) + gv1215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1215, R.dtype("float16")) + cls.add(alloc701, alloc704, alloc705) + R.vm.kill_object(alloc701) + R.vm.kill_object(alloc704) + model_decoder_layers_5_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[616] + model_decoder_layers_5_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[617] + gv1216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc706: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), 
gv1216, R.dtype("float16")) + cls.layer_norm(alloc705, model_decoder_layers_5_self_attn_layer_norm_weight3, model_decoder_layers_5_self_attn_layer_norm_bias3, alloc706) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias3) + model_decoder_layers_5_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] + model_decoder_layers_5_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[613] + gv1217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1217, R.dtype("float16")) + _705: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_q_proj_weight3, alloc706, model_decoder_layers_5_self_attn_q_proj_bias3, alloc707) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias3) + gv1218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape760: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc707, gv1218, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc707) + model_decoder_layers_5_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] + gv1219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1219, R.dtype("float16")) + _706: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_5_self_attn_k_proj_weight3, alloc706, alloc708) + R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight3) + gv1220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape761: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc708, gv1220, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc708) + model_decoder_layers_5_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] + model_decoder_layers_5_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[611] + gv1221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1221, R.dtype("float16")) + _707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_v_proj_weight3, alloc706, model_decoder_layers_5_self_attn_v_proj_bias3, alloc709) + R.vm.kill_object(alloc706) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias3) + gv1222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape762: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc709, gv1222, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc709) + gv1223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc710: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1223, R.dtype("float16")) + cls.concatenate(reshape760, reshape761, reshape762, alloc710) + R.vm.kill_object(reshape760) + R.vm.kill_object(reshape761) + R.vm.kill_object(reshape762) + gv1224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape763: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc710, gv1224, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc710) + gv1225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1225, R.dtype("float16")) + _709: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape763, alloc711) + R.vm.kill_object(reshape763) + gv1226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape764: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc711, gv1226, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc711) + gv1227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape765: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape764, gv1227, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape764) + model_decoder_layers_5_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] + model_decoder_layers_5_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[615] + gv1228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1228, R.dtype("float16")) + _710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_self_attn_out_proj_weight3, reshape765, model_decoder_layers_5_self_attn_out_proj_bias3, alloc712) + R.vm.kill_object(reshape765) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias3) + gv1229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc713: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1229, R.dtype("float16")) + cls.add(alloc705, alloc712, alloc713) + R.vm.kill_object(alloc705) + R.vm.kill_object(alloc712) + model_decoder_layers_5_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[625] + model_decoder_layers_5_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[626] + gv1230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1230, R.dtype("float16")) + cls.layer_norm(alloc713, model_decoder_layers_5_encoder_attn_layer_norm_weight3, model_decoder_layers_5_encoder_attn_layer_norm_bias3, alloc714) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias3) + model_decoder_layers_5_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] + model_decoder_layers_5_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[622] + gv1231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1231, R.dtype("float16")) + _713: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight3, alloc714, model_decoder_layers_5_encoder_attn_q_proj_bias3, alloc715) + R.vm.kill_object(alloc714) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight3) + 
R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias3) + gv1232: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape766: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc715, gv1232, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc715) + gv1233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape767: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape766, gv1233, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape766) + gv1234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1234, R.dtype("float16")) + _714: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape767, alloc716) + R.vm.kill_object(reshape767) + gv1235: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape768: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc716, gv1235, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc716) + gv1236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape769: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape768, gv1236, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape768) + model_decoder_layers_5_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] + model_decoder_layers_5_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[624] + gv1237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1237, R.dtype("float16")) + _715: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight3, reshape769, model_decoder_layers_5_encoder_attn_out_proj_bias3, alloc717) + R.vm.kill_object(reshape769) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias3) + gv1238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1238, R.dtype("float16")) + cls.add(alloc713, alloc717, alloc718) + R.vm.kill_object(alloc713) + R.vm.kill_object(alloc717) + model_decoder_layers_5_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = 
packed_params[631] + model_decoder_layers_5_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[632] + gv1239: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc719: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1239, R.dtype("float16")) + cls.layer_norm(alloc718, model_decoder_layers_5_final_layer_norm_weight3, model_decoder_layers_5_final_layer_norm_bias3, alloc719) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias3) + model_decoder_layers_5_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] + model_decoder_layers_5_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[628] + gv1240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1240, R.dtype("float16")) + _718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_5_fc1_weight3, alloc719, model_decoder_layers_5_fc1_bias3, alloc720) + R.vm.kill_object(alloc719) + R.vm.kill_object(model_decoder_layers_5_fc1_weight3) + R.vm.kill_object(model_decoder_layers_5_fc1_bias3) + model_decoder_layers_5_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] + model_decoder_layers_5_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[630] + gv1241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1241, R.dtype("float16")) + _719: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_5_fc2_weight3, alloc720, model_decoder_layers_5_fc2_bias3, alloc721) + R.vm.kill_object(alloc720) + R.vm.kill_object(model_decoder_layers_5_fc2_weight3) + R.vm.kill_object(model_decoder_layers_5_fc2_bias3) + gv1242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1242, R.dtype("float16")) + cls.add(alloc718, alloc721, alloc722) + R.vm.kill_object(alloc718) + R.vm.kill_object(alloc721) + model_decoder_layers_6_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[640] + model_decoder_layers_6_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[641] + gv1243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1243, R.dtype("float16")) + cls.layer_norm(alloc722, model_decoder_layers_6_self_attn_layer_norm_weight3, model_decoder_layers_6_self_attn_layer_norm_bias3, alloc723) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias3) + model_decoder_layers_6_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] + model_decoder_layers_6_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[637] + gv1244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1244, R.dtype("float16")) + _722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_q_proj_weight3, alloc723, model_decoder_layers_6_self_attn_q_proj_bias3, alloc724) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias3) + gv1245: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape770: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc724, gv1245, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc724) + model_decoder_layers_6_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] + gv1246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1246, R.dtype("float16")) + _723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_6_self_attn_k_proj_weight3, alloc723, alloc725) + R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight3) + gv1247: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape771: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc725, gv1247, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc725) + model_decoder_layers_6_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] + model_decoder_layers_6_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[635] + gv1248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1248, R.dtype("float16")) + _724: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_v_proj_weight3, alloc723, model_decoder_layers_6_self_attn_v_proj_bias3, alloc726) + R.vm.kill_object(alloc723) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias3) + gv1249: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape772: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc726, gv1249, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc726) + gv1250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc727: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1250, R.dtype("float16")) + cls.concatenate(reshape770, reshape771, reshape772, alloc727) + R.vm.kill_object(reshape770) + R.vm.kill_object(reshape771) + R.vm.kill_object(reshape772) + gv1251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape773: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc727, gv1251, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc727) + gv1252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1252, R.dtype("float16")) + _726: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape773, alloc728) + R.vm.kill_object(reshape773) + gv1253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape774: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc728, gv1253, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc728) + gv1254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape775: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape774, gv1254, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape774) + model_decoder_layers_6_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] + model_decoder_layers_6_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[639] + gv1255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1255, R.dtype("float16")) + _727: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_self_attn_out_proj_weight3, reshape775, model_decoder_layers_6_self_attn_out_proj_bias3, alloc729) + R.vm.kill_object(reshape775) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias3) + gv1256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1256, R.dtype("float16")) + cls.add(alloc722, alloc729, alloc730) + R.vm.kill_object(alloc722) + R.vm.kill_object(alloc729) + model_decoder_layers_6_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[649] + model_decoder_layers_6_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[650] + gv1257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1257, R.dtype("float16")) + cls.layer_norm(alloc730, model_decoder_layers_6_encoder_attn_layer_norm_weight3, model_decoder_layers_6_encoder_attn_layer_norm_bias3, alloc731) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias3) + model_decoder_layers_6_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] + model_decoder_layers_6_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[646] + gv1258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1258, R.dtype("float16")) + _730: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight3, alloc731, model_decoder_layers_6_encoder_attn_q_proj_bias3, alloc732) + R.vm.kill_object(alloc731) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias3) + gv1259: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape776: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc732, gv1259, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc732) + gv1260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape777: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape776, gv1260, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape776) + gv1261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1261, R.dtype("float16")) + _731: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape777, alloc733) + R.vm.kill_object(reshape777) + gv1262: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape778: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc733, gv1262, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc733) + gv1263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape779: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape778, gv1263, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape778) + 
model_decoder_layers_6_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] + model_decoder_layers_6_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[648] + gv1264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1264, R.dtype("float16")) + _732: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight3, reshape779, model_decoder_layers_6_encoder_attn_out_proj_bias3, alloc734) + R.vm.kill_object(reshape779) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias3) + gv1265: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1265, R.dtype("float16")) + cls.add(alloc730, alloc734, alloc735) + R.vm.kill_object(alloc730) + R.vm.kill_object(alloc734) + model_decoder_layers_6_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[655] + model_decoder_layers_6_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[656] + gv1266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc736: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1266, R.dtype("float16")) + 
cls.layer_norm(alloc735, model_decoder_layers_6_final_layer_norm_weight3, model_decoder_layers_6_final_layer_norm_bias3, alloc736) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias3) + model_decoder_layers_6_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] + model_decoder_layers_6_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[652] + gv1267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1267, R.dtype("float16")) + _735: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_6_fc1_weight3, alloc736, model_decoder_layers_6_fc1_bias3, alloc737) + R.vm.kill_object(alloc736) + R.vm.kill_object(model_decoder_layers_6_fc1_weight3) + R.vm.kill_object(model_decoder_layers_6_fc1_bias3) + model_decoder_layers_6_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] + model_decoder_layers_6_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[654] + gv1268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1268, R.dtype("float16")) + _736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_6_fc2_weight3, alloc737, model_decoder_layers_6_fc2_bias3, alloc738) + R.vm.kill_object(alloc737) + R.vm.kill_object(model_decoder_layers_6_fc2_weight3) + 
R.vm.kill_object(model_decoder_layers_6_fc2_bias3) + gv1269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1269, R.dtype("float16")) + cls.add(alloc735, alloc738, alloc739) + R.vm.kill_object(alloc735) + R.vm.kill_object(alloc738) + model_decoder_layers_7_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[664] + model_decoder_layers_7_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[665] + gv1270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1270, R.dtype("float16")) + cls.layer_norm(alloc739, model_decoder_layers_7_self_attn_layer_norm_weight3, model_decoder_layers_7_self_attn_layer_norm_bias3, alloc740) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias3) + model_decoder_layers_7_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] + model_decoder_layers_7_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[661] + gv1271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1271, R.dtype("float16")) + _739: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_q_proj_weight3, alloc740, model_decoder_layers_7_self_attn_q_proj_bias3, alloc741) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias3) + gv1272: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape780: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc741, gv1272, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc741) + model_decoder_layers_7_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] + gv1273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1273, R.dtype("float16")) + _740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_7_self_attn_k_proj_weight3, alloc740, alloc742) + R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight3) + gv1274: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape781: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc742, gv1274, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc742) + 
model_decoder_layers_7_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] + model_decoder_layers_7_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[659] + gv1275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1275, R.dtype("float16")) + _741: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_v_proj_weight3, alloc740, model_decoder_layers_7_self_attn_v_proj_bias3, alloc743) + R.vm.kill_object(alloc740) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias3) + gv1276: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape782: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc743, gv1276, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc743) + gv1277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc744: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1277, R.dtype("float16")) + cls.concatenate(reshape780, reshape781, reshape782, alloc744) + R.vm.kill_object(reshape780) + R.vm.kill_object(reshape781) + R.vm.kill_object(reshape782) + gv1278: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape783: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc744, gv1278, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc744) + gv1279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1279, R.dtype("float16")) + _743: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape783, alloc745) + R.vm.kill_object(reshape783) + gv1280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape784: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc745, gv1280, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc745) + gv1281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape785: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape784, gv1281, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape784) + model_decoder_layers_7_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[662] + model_decoder_layers_7_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[663] + gv1282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1282, R.dtype("float16")) + _744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_self_attn_out_proj_weight3, reshape785, model_decoder_layers_7_self_attn_out_proj_bias3, alloc746) + R.vm.kill_object(reshape785) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias3) + gv1283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1283, R.dtype("float16")) + cls.add(alloc739, alloc746, alloc747) + R.vm.kill_object(alloc739) + R.vm.kill_object(alloc746) + model_decoder_layers_7_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[673] + model_decoder_layers_7_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[674] + gv1284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1284, R.dtype("float16")) + cls.layer_norm(alloc747, model_decoder_layers_7_encoder_attn_layer_norm_weight3, 
model_decoder_layers_7_encoder_attn_layer_norm_bias3, alloc748) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias3) + model_decoder_layers_7_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] + model_decoder_layers_7_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[670] + gv1285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1285, R.dtype("float16")) + _747: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight3, alloc748, model_decoder_layers_7_encoder_attn_q_proj_bias3, alloc749) + R.vm.kill_object(alloc748) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias3) + gv1286: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape786: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc749, gv1286, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc749) + gv1287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape787: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape786, 
gv1287, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape786) + gv1288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc750: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1288, R.dtype("float16")) + _748: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape787, alloc750) + R.vm.kill_object(reshape787) + gv1289: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape788: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc750, gv1289, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc750) + gv1290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape789: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape788, gv1290, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape788) + model_decoder_layers_7_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] + model_decoder_layers_7_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[672] + gv1291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1291, R.dtype("float16")) + _749: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight3, reshape789, model_decoder_layers_7_encoder_attn_out_proj_bias3, alloc751) + R.vm.kill_object(reshape789) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias3) + gv1292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc752: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1292, R.dtype("float16")) + cls.add(alloc747, alloc751, alloc752) + R.vm.kill_object(alloc747) + R.vm.kill_object(alloc751) + model_decoder_layers_7_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[679] + model_decoder_layers_7_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[680] + gv1293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc753: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1293, R.dtype("float16")) + cls.layer_norm(alloc752, model_decoder_layers_7_final_layer_norm_weight3, model_decoder_layers_7_final_layer_norm_bias3, alloc753) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias3) + model_decoder_layers_7_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] + model_decoder_layers_7_fc1_bias3: 
R.Tensor((5120,), dtype="float16") = packed_params[676] + gv1294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1294, R.dtype("float16")) + _752: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_7_fc1_weight3, alloc753, model_decoder_layers_7_fc1_bias3, alloc754) + R.vm.kill_object(alloc753) + R.vm.kill_object(model_decoder_layers_7_fc1_weight3) + R.vm.kill_object(model_decoder_layers_7_fc1_bias3) + model_decoder_layers_7_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] + model_decoder_layers_7_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[678] + gv1295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1295, R.dtype("float16")) + _753: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_7_fc2_weight3, alloc754, model_decoder_layers_7_fc2_bias3, alloc755) + R.vm.kill_object(alloc754) + R.vm.kill_object(model_decoder_layers_7_fc2_weight3) + R.vm.kill_object(model_decoder_layers_7_fc2_bias3) + gv1296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1296, R.dtype("float16")) + cls.add(alloc752, alloc755, 
alloc756) + R.vm.kill_object(alloc752) + R.vm.kill_object(alloc755) + model_decoder_layers_8_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[688] + model_decoder_layers_8_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[689] + gv1297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1297, R.dtype("float16")) + cls.layer_norm(alloc756, model_decoder_layers_8_self_attn_layer_norm_weight3, model_decoder_layers_8_self_attn_layer_norm_bias3, alloc757) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias3) + model_decoder_layers_8_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] + model_decoder_layers_8_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[685] + gv1298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1298, R.dtype("float16")) + _756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_q_proj_weight3, alloc757, model_decoder_layers_8_self_attn_q_proj_bias3, alloc758) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias3) + gv1299: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape790: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc758, gv1299, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc758) + model_decoder_layers_8_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] + gv1300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1300, R.dtype("float16")) + _757: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_8_self_attn_k_proj_weight3, alloc757, alloc759) + R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight3) + gv1301: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape791: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc759, gv1301, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc759) + model_decoder_layers_8_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] + model_decoder_layers_8_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[683] + gv1302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc760: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage13, R.prim_value(0), gv1302, R.dtype("float16")) + _758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_v_proj_weight3, alloc757, model_decoder_layers_8_self_attn_v_proj_bias3, alloc760) + R.vm.kill_object(alloc757) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias3) + gv1303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape792: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc760, gv1303, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc760) + gv1304: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc761: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1304, R.dtype("float16")) + cls.concatenate(reshape790, reshape791, reshape792, alloc761) + R.vm.kill_object(reshape790) + R.vm.kill_object(reshape791) + R.vm.kill_object(reshape792) + gv1305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape793: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc761, gv1305, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc761) + gv1306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1306, R.dtype("float16")) + _760: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape793, alloc762) + R.vm.kill_object(reshape793) + gv1307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape794: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc762, gv1307, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc762) + gv1308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape795: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape794, gv1308, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape794) + model_decoder_layers_8_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] + model_decoder_layers_8_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[687] + gv1309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1309, R.dtype("float16")) + _761: R.Object 
= R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_self_attn_out_proj_weight3, reshape795, model_decoder_layers_8_self_attn_out_proj_bias3, alloc763) + R.vm.kill_object(reshape795) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias3) + gv1310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1310, R.dtype("float16")) + cls.add(alloc756, alloc763, alloc764) + R.vm.kill_object(alloc756) + R.vm.kill_object(alloc763) + model_decoder_layers_8_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[697] + model_decoder_layers_8_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[698] + gv1311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1311, R.dtype("float16")) + cls.layer_norm(alloc764, model_decoder_layers_8_encoder_attn_layer_norm_weight3, model_decoder_layers_8_encoder_attn_layer_norm_bias3, alloc765) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias3) + model_decoder_layers_8_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] + model_decoder_layers_8_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[694] + gv1312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1312, R.dtype("float16")) + _764: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight3, alloc765, model_decoder_layers_8_encoder_attn_q_proj_bias3, alloc766) + R.vm.kill_object(alloc765) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias3) + gv1313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape796: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc766, gv1313, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc766) + gv1314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape797: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape796, gv1314, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape796) + gv1315: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1315, R.dtype("float16")) + _765: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape797, alloc767) + R.vm.kill_object(reshape797) + gv1316: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape798: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc767, gv1316, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc767) + gv1317: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape799: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape798, gv1317, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape798) + model_decoder_layers_8_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] + model_decoder_layers_8_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[696] + gv1318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc768: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1318, R.dtype("float16")) + _766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight3, reshape799, model_decoder_layers_8_encoder_attn_out_proj_bias3, alloc768) + R.vm.kill_object(reshape799) + 
R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias3) + gv1319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1319, R.dtype("float16")) + cls.add(alloc764, alloc768, alloc769) + R.vm.kill_object(alloc764) + R.vm.kill_object(alloc768) + model_decoder_layers_8_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[703] + model_decoder_layers_8_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[704] + gv1320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1320, R.dtype("float16")) + cls.layer_norm(alloc769, model_decoder_layers_8_final_layer_norm_weight3, model_decoder_layers_8_final_layer_norm_bias3, alloc770) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias3) + model_decoder_layers_8_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] + model_decoder_layers_8_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[700] + gv1321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1321, R.dtype("float16")) + _769: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_8_fc1_weight3, alloc770, model_decoder_layers_8_fc1_bias3, alloc771) + R.vm.kill_object(alloc770) + R.vm.kill_object(model_decoder_layers_8_fc1_weight3) + R.vm.kill_object(model_decoder_layers_8_fc1_bias3) + model_decoder_layers_8_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] + model_decoder_layers_8_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[702] + gv1322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1322, R.dtype("float16")) + _770: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_8_fc2_weight3, alloc771, model_decoder_layers_8_fc2_bias3, alloc772) + R.vm.kill_object(alloc771) + R.vm.kill_object(model_decoder_layers_8_fc2_weight3) + R.vm.kill_object(model_decoder_layers_8_fc2_bias3) + gv1323: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1323, R.dtype("float16")) + cls.add(alloc769, alloc772, alloc773) + R.vm.kill_object(alloc769) + R.vm.kill_object(alloc772) + model_decoder_layers_9_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[712] + model_decoder_layers_9_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[713] + gv1324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc774: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1324, R.dtype("float16")) + cls.layer_norm(alloc773, model_decoder_layers_9_self_attn_layer_norm_weight3, model_decoder_layers_9_self_attn_layer_norm_bias3, alloc774) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias3) + model_decoder_layers_9_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] + model_decoder_layers_9_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[709] + gv1325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1325, R.dtype("float16")) + _773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_q_proj_weight3, alloc774, model_decoder_layers_9_self_attn_q_proj_bias3, alloc775) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias3) + gv1326: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape800: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc775, gv1326, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc775) + model_decoder_layers_9_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] + 
gv1327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1327, R.dtype("float16")) + _774: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_9_self_attn_k_proj_weight3, alloc774, alloc776) + R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight3) + gv1328: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape801: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc776, gv1328, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc776) + model_decoder_layers_9_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] + model_decoder_layers_9_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[707] + gv1329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1329, R.dtype("float16")) + _775: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_v_proj_weight3, alloc774, model_decoder_layers_9_self_attn_v_proj_bias3, alloc777) + R.vm.kill_object(alloc774) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias3) + 
gv1330: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape802: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc777, gv1330, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc777) + gv1331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc778: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1331, R.dtype("float16")) + cls.concatenate(reshape800, reshape801, reshape802, alloc778) + R.vm.kill_object(reshape800) + R.vm.kill_object(reshape801) + R.vm.kill_object(reshape802) + gv1332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape803: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc778, gv1332, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc778) + gv1333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1333, R.dtype("float16")) + _777: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape803, alloc779) + 
R.vm.kill_object(reshape803) + gv1334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape804: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc779, gv1334, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc779) + gv1335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape805: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape804, gv1335, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape804) + model_decoder_layers_9_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] + model_decoder_layers_9_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[711] + gv1336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1336, R.dtype("float16")) + _778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_self_attn_out_proj_weight3, reshape805, model_decoder_layers_9_self_attn_out_proj_bias3, alloc780) + R.vm.kill_object(reshape805) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias3) + gv1337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1337, R.dtype("float16")) + cls.add(alloc773, alloc780, alloc781) + R.vm.kill_object(alloc773) + R.vm.kill_object(alloc780) + model_decoder_layers_9_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[721] + model_decoder_layers_9_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[722] + gv1338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc782: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1338, R.dtype("float16")) + cls.layer_norm(alloc781, model_decoder_layers_9_encoder_attn_layer_norm_weight3, model_decoder_layers_9_encoder_attn_layer_norm_bias3, alloc782) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias3) + model_decoder_layers_9_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] + model_decoder_layers_9_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[718] + gv1339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1339, R.dtype("float16")) + _781: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight3, alloc782, 
model_decoder_layers_9_encoder_attn_q_proj_bias3, alloc783) + R.vm.kill_object(alloc782) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias3) + gv1340: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape806: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc783, gv1340, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc783) + gv1341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape807: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape806, gv1341, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape806) + gv1342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1342, R.dtype("float16")) + _782: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape807, alloc784) + R.vm.kill_object(reshape807) + gv1343: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape808: 
R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc784, gv1343, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc784) + gv1344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape809: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape808, gv1344, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape808) + model_decoder_layers_9_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] + model_decoder_layers_9_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[720] + gv1345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1345, R.dtype("float16")) + _783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight3, reshape809, model_decoder_layers_9_encoder_attn_out_proj_bias3, alloc785) + R.vm.kill_object(reshape809) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias3) + gv1346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1346, R.dtype("float16")) + cls.add(alloc781, 
alloc785, alloc786) + R.vm.kill_object(alloc781) + R.vm.kill_object(alloc785) + model_decoder_layers_9_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[727] + model_decoder_layers_9_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[728] + gv1347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1347, R.dtype("float16")) + cls.layer_norm(alloc786, model_decoder_layers_9_final_layer_norm_weight3, model_decoder_layers_9_final_layer_norm_bias3, alloc787) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias3) + model_decoder_layers_9_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] + model_decoder_layers_9_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[724] + gv1348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1348, R.dtype("float16")) + _786: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_9_fc1_weight3, alloc787, model_decoder_layers_9_fc1_bias3, alloc788) + R.vm.kill_object(alloc787) + R.vm.kill_object(model_decoder_layers_9_fc1_weight3) + R.vm.kill_object(model_decoder_layers_9_fc1_bias3) + model_decoder_layers_9_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] + model_decoder_layers_9_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[726] + gv1349: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1349, R.dtype("float16")) + _787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_9_fc2_weight3, alloc788, model_decoder_layers_9_fc2_bias3, alloc789) + R.vm.kill_object(alloc788) + R.vm.kill_object(model_decoder_layers_9_fc2_weight3) + R.vm.kill_object(model_decoder_layers_9_fc2_bias3) + gv1350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1350, R.dtype("float16")) + cls.add(alloc786, alloc789, alloc790) + R.vm.kill_object(alloc786) + R.vm.kill_object(alloc789) + model_decoder_layers_10_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[736] + model_decoder_layers_10_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[737] + gv1351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1351, R.dtype("float16")) + cls.layer_norm(alloc790, model_decoder_layers_10_self_attn_layer_norm_weight3, model_decoder_layers_10_self_attn_layer_norm_bias3, alloc791) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias3) + 
model_decoder_layers_10_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] + model_decoder_layers_10_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[733] + gv1352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1352, R.dtype("float16")) + _790: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_q_proj_weight3, alloc791, model_decoder_layers_10_self_attn_q_proj_bias3, alloc792) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias3) + gv1353: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape810: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc792, gv1353, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc792) + model_decoder_layers_10_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] + gv1354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1354, R.dtype("float16")) + _791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_10_self_attn_k_proj_weight3, 
alloc791, alloc793) + R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight3) + gv1355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape811: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc793, gv1355, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc793) + model_decoder_layers_10_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] + model_decoder_layers_10_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[731] + gv1356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1356, R.dtype("float16")) + _792: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_v_proj_weight3, alloc791, model_decoder_layers_10_self_attn_v_proj_bias3, alloc794) + R.vm.kill_object(alloc791) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias3) + gv1357: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape812: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc794, gv1357, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc794) 
+ gv1358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc795: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1358, R.dtype("float16")) + cls.concatenate(reshape810, reshape811, reshape812, alloc795) + R.vm.kill_object(reshape810) + R.vm.kill_object(reshape811) + R.vm.kill_object(reshape812) + gv1359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape813: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc795, gv1359, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc795) + gv1360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1360, R.dtype("float16")) + _794: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape813, alloc796) + R.vm.kill_object(reshape813) + gv1361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape814: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc796, gv1361, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc796) + gv1362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape815: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape814, gv1362, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape814) + model_decoder_layers_10_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] + model_decoder_layers_10_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[735] + gv1363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1363, R.dtype("float16")) + _795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_self_attn_out_proj_weight3, reshape815, model_decoder_layers_10_self_attn_out_proj_bias3, alloc797) + R.vm.kill_object(reshape815) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias3) + gv1364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1364, R.dtype("float16")) + cls.add(alloc790, alloc797, alloc798) + R.vm.kill_object(alloc790) + R.vm.kill_object(alloc797) + model_decoder_layers_10_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = 
packed_params[745] + model_decoder_layers_10_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[746] + gv1365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1365, R.dtype("float16")) + cls.layer_norm(alloc798, model_decoder_layers_10_encoder_attn_layer_norm_weight3, model_decoder_layers_10_encoder_attn_layer_norm_bias3, alloc799) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias3) + model_decoder_layers_10_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] + model_decoder_layers_10_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[742] + gv1366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1366, R.dtype("float16")) + _798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight3, alloc799, model_decoder_layers_10_encoder_attn_q_proj_bias3, alloc800) + R.vm.kill_object(alloc799) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias3) + gv1367: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape816: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc800, gv1367, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc800) + gv1368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape817: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape816, gv1368, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape816) + gv1369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1369, R.dtype("float16")) + _799: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape817, alloc801) + R.vm.kill_object(reshape817) + gv1370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape818: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc801, gv1370, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc801) + gv1371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape819: R.Tensor((batch_size, 1, 
1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape818, gv1371, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape818) + model_decoder_layers_10_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] + model_decoder_layers_10_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[744] + gv1372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1372, R.dtype("float16")) + _800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight3, reshape819, model_decoder_layers_10_encoder_attn_out_proj_bias3, alloc802) + R.vm.kill_object(reshape819) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias3) + gv1373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1373, R.dtype("float16")) + cls.add(alloc798, alloc802, alloc803) + R.vm.kill_object(alloc798) + R.vm.kill_object(alloc802) + model_decoder_layers_10_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[751] + model_decoder_layers_10_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[752] + gv1374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1374, R.dtype("float16")) + cls.layer_norm(alloc803, model_decoder_layers_10_final_layer_norm_weight3, model_decoder_layers_10_final_layer_norm_bias3, alloc804) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias3) + model_decoder_layers_10_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] + model_decoder_layers_10_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[748] + gv1375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1375, R.dtype("float16")) + _803: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_10_fc1_weight3, alloc804, model_decoder_layers_10_fc1_bias3, alloc805) + R.vm.kill_object(alloc804) + R.vm.kill_object(model_decoder_layers_10_fc1_weight3) + R.vm.kill_object(model_decoder_layers_10_fc1_bias3) + model_decoder_layers_10_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] + model_decoder_layers_10_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[750] + gv1376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc806: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1376, R.dtype("float16")) + _804: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", 
model_decoder_layers_10_fc2_weight3, alloc805, model_decoder_layers_10_fc2_bias3, alloc806) + R.vm.kill_object(alloc805) + R.vm.kill_object(model_decoder_layers_10_fc2_weight3) + R.vm.kill_object(model_decoder_layers_10_fc2_bias3) + gv1377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1377, R.dtype("float16")) + cls.add(alloc803, alloc806, alloc807) + R.vm.kill_object(alloc803) + R.vm.kill_object(alloc806) + model_decoder_layers_11_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[760] + model_decoder_layers_11_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[761] + gv1378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1378, R.dtype("float16")) + cls.layer_norm(alloc807, model_decoder_layers_11_self_attn_layer_norm_weight3, model_decoder_layers_11_self_attn_layer_norm_bias3, alloc808) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias3) + model_decoder_layers_11_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] + model_decoder_layers_11_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[757] + gv1379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1379, R.dtype("float16")) + _807: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_q_proj_weight3, alloc808, model_decoder_layers_11_self_attn_q_proj_bias3, alloc809) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias3) + gv1380: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape820: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc809, gv1380, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc809) + model_decoder_layers_11_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] + gv1381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1381, R.dtype("float16")) + _808: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_11_self_attn_k_proj_weight3, alloc808, alloc810) + R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight3) + gv1382: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape821: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc810, gv1382, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc810) + model_decoder_layers_11_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] + model_decoder_layers_11_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[755] + gv1383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1383, R.dtype("float16")) + _809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_v_proj_weight3, alloc808, model_decoder_layers_11_self_attn_v_proj_bias3, alloc811) + R.vm.kill_object(alloc808) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias3) + gv1384: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape822: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc811, gv1384, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc811) + gv1385: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc812: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1385, R.dtype("float16")) + cls.concatenate(reshape820, 
reshape821, reshape822, alloc812) + R.vm.kill_object(reshape820) + R.vm.kill_object(reshape821) + R.vm.kill_object(reshape822) + gv1386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape823: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc812, gv1386, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc812) + gv1387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1387, R.dtype("float16")) + _811: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape823, alloc813) + R.vm.kill_object(reshape823) + gv1388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape824: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc813, gv1388, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc813) + gv1389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape825: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape824, gv1389, sinfo_args=(R.Tensor((batch_size, 1, 
1280), dtype="float16"),)) + R.vm.kill_object(reshape824) + model_decoder_layers_11_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] + model_decoder_layers_11_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[759] + gv1390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1390, R.dtype("float16")) + _812: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_self_attn_out_proj_weight3, reshape825, model_decoder_layers_11_self_attn_out_proj_bias3, alloc814) + R.vm.kill_object(reshape825) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias3) + gv1391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1391, R.dtype("float16")) + cls.add(alloc807, alloc814, alloc815) + R.vm.kill_object(alloc807) + R.vm.kill_object(alloc814) + model_decoder_layers_11_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[769] + model_decoder_layers_11_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[770] + gv1392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc816: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage16, R.prim_value(0), gv1392, R.dtype("float16")) + cls.layer_norm(alloc815, model_decoder_layers_11_encoder_attn_layer_norm_weight3, model_decoder_layers_11_encoder_attn_layer_norm_bias3, alloc816) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias3) + model_decoder_layers_11_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] + model_decoder_layers_11_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[766] + gv1393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1393, R.dtype("float16")) + _815: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight3, alloc816, model_decoder_layers_11_encoder_attn_q_proj_bias3, alloc817) + R.vm.kill_object(alloc816) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias3) + gv1394: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape826: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc817, gv1394, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc817) + gv1395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape827: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape826, gv1395, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape826) + gv1396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1396, R.dtype("float16")) + _816: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape827, alloc818) + R.vm.kill_object(reshape827) + gv1397: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape828: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc818, gv1397, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc818) + gv1398: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape829: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape828, gv1398, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape828) + model_decoder_layers_11_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] + model_decoder_layers_11_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[768] + gv1399: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1399, R.dtype("float16")) + _817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight3, reshape829, model_decoder_layers_11_encoder_attn_out_proj_bias3, alloc819) + R.vm.kill_object(reshape829) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias3) + gv1400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1400, R.dtype("float16")) + cls.add(alloc815, alloc819, alloc820) + R.vm.kill_object(alloc815) + R.vm.kill_object(alloc819) + model_decoder_layers_11_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[775] + model_decoder_layers_11_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[776] + gv1401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1401, R.dtype("float16")) + cls.layer_norm(alloc820, model_decoder_layers_11_final_layer_norm_weight3, model_decoder_layers_11_final_layer_norm_bias3, alloc821) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight3) + 
R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias3) + model_decoder_layers_11_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] + model_decoder_layers_11_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[772] + gv1402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1402, R.dtype("float16")) + _820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_11_fc1_weight3, alloc821, model_decoder_layers_11_fc1_bias3, alloc822) + R.vm.kill_object(alloc821) + R.vm.kill_object(model_decoder_layers_11_fc1_weight3) + R.vm.kill_object(model_decoder_layers_11_fc1_bias3) + model_decoder_layers_11_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] + model_decoder_layers_11_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[774] + gv1403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1403, R.dtype("float16")) + _821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_11_fc2_weight3, alloc822, model_decoder_layers_11_fc2_bias3, alloc823) + R.vm.kill_object(alloc822) + R.vm.kill_object(model_decoder_layers_11_fc2_weight3) + R.vm.kill_object(model_decoder_layers_11_fc2_bias3) + gv1404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1404, R.dtype("float16")) + cls.add(alloc820, alloc823, alloc824) + R.vm.kill_object(alloc820) + R.vm.kill_object(alloc823) + model_decoder_layers_12_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[784] + model_decoder_layers_12_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[785] + gv1405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1405, R.dtype("float16")) + cls.layer_norm(alloc824, model_decoder_layers_12_self_attn_layer_norm_weight3, model_decoder_layers_12_self_attn_layer_norm_bias3, alloc825) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias3) + model_decoder_layers_12_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] + model_decoder_layers_12_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[781] + gv1406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1406, R.dtype("float16")) + _824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_q_proj_weight3, alloc825, model_decoder_layers_12_self_attn_q_proj_bias3, alloc826) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight3) + 
R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias3) + gv1407: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape830: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc826, gv1407, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc826) + model_decoder_layers_12_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] + gv1408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1408, R.dtype("float16")) + _825: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_12_self_attn_k_proj_weight3, alloc825, alloc827) + R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight3) + gv1409: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape831: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc827, gv1409, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc827) + model_decoder_layers_12_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] + model_decoder_layers_12_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[779] + gv1410: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1410, R.dtype("float16")) + _826: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_v_proj_weight3, alloc825, model_decoder_layers_12_self_attn_v_proj_bias3, alloc828) + R.vm.kill_object(alloc825) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias3) + gv1411: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape832: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc828, gv1411, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc828) + gv1412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc829: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1412, R.dtype("float16")) + cls.concatenate(reshape830, reshape831, reshape832, alloc829) + R.vm.kill_object(reshape830) + R.vm.kill_object(reshape831) + R.vm.kill_object(reshape832) + gv1413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape833: R.Tensor((batch_size, 60, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc829, gv1413, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc829) + gv1414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1414, R.dtype("float16")) + _828: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape833, alloc830) + R.vm.kill_object(reshape833) + gv1415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape834: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc830, gv1415, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc830) + gv1416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape835: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape834, gv1416, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape834) + model_decoder_layers_12_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] + model_decoder_layers_12_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[783] + gv1417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc831: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1417, R.dtype("float16")) + _829: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_self_attn_out_proj_weight3, reshape835, model_decoder_layers_12_self_attn_out_proj_bias3, alloc831) + R.vm.kill_object(reshape835) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias3) + gv1418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1418, R.dtype("float16")) + cls.add(alloc824, alloc831, alloc832) + R.vm.kill_object(alloc824) + R.vm.kill_object(alloc831) + model_decoder_layers_12_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[793] + model_decoder_layers_12_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[794] + gv1419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1419, R.dtype("float16")) + cls.layer_norm(alloc832, model_decoder_layers_12_encoder_attn_layer_norm_weight3, model_decoder_layers_12_encoder_attn_layer_norm_bias3, alloc833) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias3) + 
model_decoder_layers_12_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] + model_decoder_layers_12_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[790] + gv1420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1420, R.dtype("float16")) + _832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight3, alloc833, model_decoder_layers_12_encoder_attn_q_proj_bias3, alloc834) + R.vm.kill_object(alloc833) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias3) + gv1421: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape836: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc834, gv1421, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc834) + gv1422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape837: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape836, gv1422, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape836) + gv1423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1423, R.dtype("float16")) + _833: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape837, alloc835) + R.vm.kill_object(reshape837) + gv1424: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape838: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc835, gv1424, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc835) + gv1425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape839: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape838, gv1425, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape838) + model_decoder_layers_12_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] + model_decoder_layers_12_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[792] + gv1426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1426, R.dtype("float16")) + _834: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight3, reshape839, model_decoder_layers_12_encoder_attn_out_proj_bias3, alloc836) + R.vm.kill_object(reshape839) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias3) + gv1427: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1427, R.dtype("float16")) + cls.add(alloc832, alloc836, alloc837) + R.vm.kill_object(alloc832) + R.vm.kill_object(alloc836) + model_decoder_layers_12_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[799] + model_decoder_layers_12_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[800] + gv1428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1428, R.dtype("float16")) + cls.layer_norm(alloc837, model_decoder_layers_12_final_layer_norm_weight3, model_decoder_layers_12_final_layer_norm_bias3, alloc838) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias3) + model_decoder_layers_12_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] + model_decoder_layers_12_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[796] + gv1429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc839: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1429, R.dtype("float16")) + _837: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_12_fc1_weight3, alloc838, model_decoder_layers_12_fc1_bias3, alloc839) + R.vm.kill_object(alloc838) + R.vm.kill_object(model_decoder_layers_12_fc1_weight3) + R.vm.kill_object(model_decoder_layers_12_fc1_bias3) + model_decoder_layers_12_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] + model_decoder_layers_12_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[798] + gv1430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1430, R.dtype("float16")) + _838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_12_fc2_weight3, alloc839, model_decoder_layers_12_fc2_bias3, alloc840) + R.vm.kill_object(alloc839) + R.vm.kill_object(model_decoder_layers_12_fc2_weight3) + R.vm.kill_object(model_decoder_layers_12_fc2_bias3) + gv1431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1431, R.dtype("float16")) + cls.add(alloc837, alloc840, alloc841) + R.vm.kill_object(alloc837) + R.vm.kill_object(alloc840) + model_decoder_layers_13_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[808] + 
model_decoder_layers_13_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[809] + gv1432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc842: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1432, R.dtype("float16")) + cls.layer_norm(alloc841, model_decoder_layers_13_self_attn_layer_norm_weight3, model_decoder_layers_13_self_attn_layer_norm_bias3, alloc842) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias3) + model_decoder_layers_13_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] + model_decoder_layers_13_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[805] + gv1433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1433, R.dtype("float16")) + _841: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_q_proj_weight3, alloc842, model_decoder_layers_13_self_attn_q_proj_bias3, alloc843) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias3) + gv1434: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape840: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc843, gv1434, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc843) + model_decoder_layers_13_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] + gv1435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1435, R.dtype("float16")) + _842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_13_self_attn_k_proj_weight3, alloc842, alloc844) + R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight3) + gv1436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape841: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc844, gv1436, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc844) + model_decoder_layers_13_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] + model_decoder_layers_13_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[803] + gv1437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1437, R.dtype("float16")) + _843: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_13_self_attn_v_proj_weight3, alloc842, model_decoder_layers_13_self_attn_v_proj_bias3, alloc845) + R.vm.kill_object(alloc842) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias3) + gv1438: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape842: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc845, gv1438, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc845) + gv1439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc846: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1439, R.dtype("float16")) + cls.concatenate(reshape840, reshape841, reshape842, alloc846) + R.vm.kill_object(reshape840) + R.vm.kill_object(reshape841) + R.vm.kill_object(reshape842) + gv1440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape843: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc846, gv1440, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc846) + gv1441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc847: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1441, R.dtype("float16")) + _845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape843, alloc847) + R.vm.kill_object(reshape843) + gv1442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape844: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc847, gv1442, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc847) + gv1443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape845: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape844, gv1443, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape844) + model_decoder_layers_13_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] + model_decoder_layers_13_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[807] + gv1444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1444, R.dtype("float16")) + _846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_self_attn_out_proj_weight3, reshape845, 
model_decoder_layers_13_self_attn_out_proj_bias3, alloc848) + R.vm.kill_object(reshape845) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias3) + gv1445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc849: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1445, R.dtype("float16")) + cls.add(alloc841, alloc848, alloc849) + R.vm.kill_object(alloc841) + R.vm.kill_object(alloc848) + model_decoder_layers_13_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[817] + model_decoder_layers_13_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[818] + gv1446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1446, R.dtype("float16")) + cls.layer_norm(alloc849, model_decoder_layers_13_encoder_attn_layer_norm_weight3, model_decoder_layers_13_encoder_attn_layer_norm_bias3, alloc850) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias3) + model_decoder_layers_13_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] + model_decoder_layers_13_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[814] + gv1447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1447, R.dtype("float16")) + _849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight3, alloc850, model_decoder_layers_13_encoder_attn_q_proj_bias3, alloc851) + R.vm.kill_object(alloc850) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias3) + gv1448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape846: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc851, gv1448, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc851) + gv1449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape847: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape846, gv1449, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape846) + gv1450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1450, R.dtype("float16")) + _850: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape847, alloc852) + 
R.vm.kill_object(reshape847) + gv1451: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape848: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc852, gv1451, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc852) + gv1452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape849: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape848, gv1452, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape848) + model_decoder_layers_13_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] + model_decoder_layers_13_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[816] + gv1453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1453, R.dtype("float16")) + _851: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight3, reshape849, model_decoder_layers_13_encoder_attn_out_proj_bias3, alloc853) + R.vm.kill_object(reshape849) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias3) + gv1454: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc854: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1454, R.dtype("float16")) + cls.add(alloc849, alloc853, alloc854) + R.vm.kill_object(alloc849) + R.vm.kill_object(alloc853) + model_decoder_layers_13_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[823] + model_decoder_layers_13_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[824] + gv1455: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1455, R.dtype("float16")) + cls.layer_norm(alloc854, model_decoder_layers_13_final_layer_norm_weight3, model_decoder_layers_13_final_layer_norm_bias3, alloc855) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias3) + model_decoder_layers_13_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] + model_decoder_layers_13_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[820] + gv1456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1456, R.dtype("float16")) + _854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_13_fc1_weight3, alloc855, model_decoder_layers_13_fc1_bias3, 
alloc856) + R.vm.kill_object(alloc855) + R.vm.kill_object(model_decoder_layers_13_fc1_weight3) + R.vm.kill_object(model_decoder_layers_13_fc1_bias3) + model_decoder_layers_13_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] + model_decoder_layers_13_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[822] + gv1457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1457, R.dtype("float16")) + _855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_13_fc2_weight3, alloc856, model_decoder_layers_13_fc2_bias3, alloc857) + R.vm.kill_object(alloc856) + R.vm.kill_object(model_decoder_layers_13_fc2_weight3) + R.vm.kill_object(model_decoder_layers_13_fc2_bias3) + gv1458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc858: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1458, R.dtype("float16")) + cls.add(alloc854, alloc857, alloc858) + R.vm.kill_object(alloc854) + R.vm.kill_object(alloc857) + model_decoder_layers_14_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[832] + model_decoder_layers_14_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[833] + gv1459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc859: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage17, R.prim_value(0), gv1459, R.dtype("float16")) + cls.layer_norm(alloc858, model_decoder_layers_14_self_attn_layer_norm_weight3, model_decoder_layers_14_self_attn_layer_norm_bias3, alloc859) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias3) + model_decoder_layers_14_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] + model_decoder_layers_14_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[829] + gv1460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1460, R.dtype("float16")) + _858: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_q_proj_weight3, alloc859, model_decoder_layers_14_self_attn_q_proj_bias3, alloc860) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias3) + gv1461: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape850: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc860, gv1461, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc860) + model_decoder_layers_14_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] + gv1462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1462, R.dtype("float16")) + _859: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_14_self_attn_k_proj_weight3, alloc859, alloc861) + R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight3) + gv1463: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape851: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc861, gv1463, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc861) + model_decoder_layers_14_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] + model_decoder_layers_14_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[827] + gv1464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc862: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1464, R.dtype("float16")) + _860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_v_proj_weight3, alloc859, model_decoder_layers_14_self_attn_v_proj_bias3, alloc862) + R.vm.kill_object(alloc859) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias3) + gv1465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape852: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc862, gv1465, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc862) + gv1466: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc863: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1466, R.dtype("float16")) + cls.concatenate(reshape850, reshape851, reshape852, alloc863) + R.vm.kill_object(reshape850) + R.vm.kill_object(reshape851) + R.vm.kill_object(reshape852) + gv1467: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape853: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc863, gv1467, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc863) + gv1468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1468, R.dtype("float16")) + _862: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape853, alloc864) + R.vm.kill_object(reshape853) + gv1469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape854: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc864, gv1469, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc864) + gv1470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape855: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape854, gv1470, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape854) + model_decoder_layers_14_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] + model_decoder_layers_14_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[831] + gv1471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1471, R.dtype("float16")) + _863: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_self_attn_out_proj_weight3, reshape855, model_decoder_layers_14_self_attn_out_proj_bias3, alloc865) + R.vm.kill_object(reshape855) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias3) + gv1472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1472, R.dtype("float16")) + cls.add(alloc858, alloc865, alloc866) + R.vm.kill_object(alloc858) + R.vm.kill_object(alloc865) + model_decoder_layers_14_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[841] + model_decoder_layers_14_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[842] + gv1473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1473, R.dtype("float16")) + cls.layer_norm(alloc866, model_decoder_layers_14_encoder_attn_layer_norm_weight3, model_decoder_layers_14_encoder_attn_layer_norm_bias3, alloc867) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias3) + model_decoder_layers_14_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] + model_decoder_layers_14_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[838] + gv1474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1474, R.dtype("float16")) + _866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight3, alloc867, model_decoder_layers_14_encoder_attn_q_proj_bias3, alloc868) + R.vm.kill_object(alloc867) + 
R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias3) + gv1475: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape856: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc868, gv1475, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc868) + gv1476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape857: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape856, gv1476, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape856) + gv1477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1477, R.dtype("float16")) + _867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape857, alloc869) + R.vm.kill_object(reshape857) + gv1478: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape858: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc869, 
gv1478, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc869) + gv1479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape859: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape858, gv1479, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape858) + model_decoder_layers_14_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] + model_decoder_layers_14_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[840] + gv1480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1480, R.dtype("float16")) + _868: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight3, reshape859, model_decoder_layers_14_encoder_attn_out_proj_bias3, alloc870) + R.vm.kill_object(reshape859) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias3) + gv1481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1481, R.dtype("float16")) + cls.add(alloc866, alloc870, alloc871) + R.vm.kill_object(alloc866) + R.vm.kill_object(alloc870) + 
model_decoder_layers_14_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[847] + model_decoder_layers_14_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[848] + gv1482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1482, R.dtype("float16")) + cls.layer_norm(alloc871, model_decoder_layers_14_final_layer_norm_weight3, model_decoder_layers_14_final_layer_norm_bias3, alloc872) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias3) + model_decoder_layers_14_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] + model_decoder_layers_14_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[844] + gv1483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1483, R.dtype("float16")) + _871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_14_fc1_weight3, alloc872, model_decoder_layers_14_fc1_bias3, alloc873) + R.vm.kill_object(alloc872) + R.vm.kill_object(model_decoder_layers_14_fc1_weight3) + R.vm.kill_object(model_decoder_layers_14_fc1_bias3) + model_decoder_layers_14_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] + model_decoder_layers_14_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[846] + gv1484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1484, R.dtype("float16")) + _872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_14_fc2_weight3, alloc873, model_decoder_layers_14_fc2_bias3, alloc874) + R.vm.kill_object(alloc873) + R.vm.kill_object(model_decoder_layers_14_fc2_weight3) + R.vm.kill_object(model_decoder_layers_14_fc2_bias3) + gv1485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1485, R.dtype("float16")) + cls.add(alloc871, alloc874, alloc875) + R.vm.kill_object(alloc871) + R.vm.kill_object(alloc874) + model_decoder_layers_15_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[856] + model_decoder_layers_15_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[857] + gv1486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1486, R.dtype("float16")) + cls.layer_norm(alloc875, model_decoder_layers_15_self_attn_layer_norm_weight3, model_decoder_layers_15_self_attn_layer_norm_bias3, alloc876) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias3) + model_decoder_layers_15_self_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[852] + model_decoder_layers_15_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[853] + gv1487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1487, R.dtype("float16")) + _875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_q_proj_weight3, alloc876, model_decoder_layers_15_self_attn_q_proj_bias3, alloc877) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias3) + gv1488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape860: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc877, gv1488, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc877) + model_decoder_layers_15_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] + gv1489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1489, R.dtype("float16")) + _876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_15_self_attn_k_proj_weight3, alloc876, alloc878) + 
R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight3) + gv1490: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape861: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc878, gv1490, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc878) + model_decoder_layers_15_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] + model_decoder_layers_15_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[851] + gv1491: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1491, R.dtype("float16")) + _877: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_v_proj_weight3, alloc876, model_decoder_layers_15_self_attn_v_proj_bias3, alloc879) + R.vm.kill_object(alloc876) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias3) + gv1492: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape862: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc879, gv1492, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc879) + gv1493: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc880: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1493, R.dtype("float16")) + cls.concatenate(reshape860, reshape861, reshape862, alloc880) + R.vm.kill_object(reshape860) + R.vm.kill_object(reshape861) + R.vm.kill_object(reshape862) + gv1494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape863: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc880, gv1494, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc880) + gv1495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1495, R.dtype("float16")) + _879: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape863, alloc881) + R.vm.kill_object(reshape863) + gv1496: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape864: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc881, gv1496, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc881) + gv1497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape865: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape864, gv1497, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape864) + model_decoder_layers_15_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] + model_decoder_layers_15_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[855] + gv1498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc882: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1498, R.dtype("float16")) + _880: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_self_attn_out_proj_weight3, reshape865, model_decoder_layers_15_self_attn_out_proj_bias3, alloc882) + R.vm.kill_object(reshape865) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias3) + gv1499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1499, R.dtype("float16")) + cls.add(alloc875, alloc882, alloc883) + R.vm.kill_object(alloc875) + R.vm.kill_object(alloc882) + model_decoder_layers_15_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = 
packed_params[865] + model_decoder_layers_15_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[866] + gv1500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1500, R.dtype("float16")) + cls.layer_norm(alloc883, model_decoder_layers_15_encoder_attn_layer_norm_weight3, model_decoder_layers_15_encoder_attn_layer_norm_bias3, alloc884) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias3) + model_decoder_layers_15_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] + model_decoder_layers_15_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[862] + gv1501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1501, R.dtype("float16")) + _883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight3, alloc884, model_decoder_layers_15_encoder_attn_q_proj_bias3, alloc885) + R.vm.kill_object(alloc884) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias3) + gv1502: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape866: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc885, gv1502, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc885) + gv1503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape867: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape866, gv1503, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape866) + gv1504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1504, R.dtype("float16")) + _884: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape867, alloc886) + R.vm.kill_object(reshape867) + gv1505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape868: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc886, gv1505, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc886) + gv1506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape869: R.Tensor((batch_size, 1, 
1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape868, gv1506, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape868) + model_decoder_layers_15_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] + model_decoder_layers_15_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[864] + gv1507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1507, R.dtype("float16")) + _885: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight3, reshape869, model_decoder_layers_15_encoder_attn_out_proj_bias3, alloc887) + R.vm.kill_object(reshape869) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias3) + gv1508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1508, R.dtype("float16")) + cls.add(alloc883, alloc887, alloc888) + R.vm.kill_object(alloc883) + R.vm.kill_object(alloc887) + model_decoder_layers_15_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[871] + model_decoder_layers_15_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[872] + gv1509: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1509, R.dtype("float16")) + cls.layer_norm(alloc888, model_decoder_layers_15_final_layer_norm_weight3, model_decoder_layers_15_final_layer_norm_bias3, alloc889) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias3) + model_decoder_layers_15_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] + model_decoder_layers_15_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[868] + gv1510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1510, R.dtype("float16")) + _888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_15_fc1_weight3, alloc889, model_decoder_layers_15_fc1_bias3, alloc890) + R.vm.kill_object(alloc889) + R.vm.kill_object(model_decoder_layers_15_fc1_weight3) + R.vm.kill_object(model_decoder_layers_15_fc1_bias3) + model_decoder_layers_15_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] + model_decoder_layers_15_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[870] + gv1511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1511, R.dtype("float16")) + _889: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", 
model_decoder_layers_15_fc2_weight3, alloc890, model_decoder_layers_15_fc2_bias3, alloc891) + R.vm.kill_object(alloc890) + R.vm.kill_object(model_decoder_layers_15_fc2_weight3) + R.vm.kill_object(model_decoder_layers_15_fc2_bias3) + gv1512: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1512, R.dtype("float16")) + cls.add(alloc888, alloc891, alloc892) + R.vm.kill_object(alloc888) + R.vm.kill_object(alloc891) + model_decoder_layers_16_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[880] + model_decoder_layers_16_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[881] + gv1513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc893: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1513, R.dtype("float16")) + cls.layer_norm(alloc892, model_decoder_layers_16_self_attn_layer_norm_weight3, model_decoder_layers_16_self_attn_layer_norm_bias3, alloc893) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias3) + model_decoder_layers_16_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] + model_decoder_layers_16_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[877] + gv1514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc894: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1514, R.dtype("float16")) + _892: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_q_proj_weight3, alloc893, model_decoder_layers_16_self_attn_q_proj_bias3, alloc894) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias3) + gv1515: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape870: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc894, gv1515, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc894) + model_decoder_layers_16_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] + gv1516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1516, R.dtype("float16")) + _893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_16_self_attn_k_proj_weight3, alloc893, alloc895) + R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight3) + gv1517: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape871: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc895, gv1517, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc895) + model_decoder_layers_16_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] + model_decoder_layers_16_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[875] + gv1518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1518, R.dtype("float16")) + _894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_v_proj_weight3, alloc893, model_decoder_layers_16_self_attn_v_proj_bias3, alloc896) + R.vm.kill_object(alloc893) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias3) + gv1519: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape872: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc896, gv1519, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc896) + gv1520: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc897: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1520, R.dtype("float16")) + cls.concatenate(reshape870, 
reshape871, reshape872, alloc897) + R.vm.kill_object(reshape870) + R.vm.kill_object(reshape871) + R.vm.kill_object(reshape872) + gv1521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape873: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc897, gv1521, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc897) + gv1522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1522, R.dtype("float16")) + _896: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape873, alloc898) + R.vm.kill_object(reshape873) + gv1523: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape874: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc898, gv1523, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc898) + gv1524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape875: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape874, gv1524, sinfo_args=(R.Tensor((batch_size, 1, 
1280), dtype="float16"),)) + R.vm.kill_object(reshape874) + model_decoder_layers_16_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] + model_decoder_layers_16_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[879] + gv1525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1525, R.dtype("float16")) + _897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_self_attn_out_proj_weight3, reshape875, model_decoder_layers_16_self_attn_out_proj_bias3, alloc899) + R.vm.kill_object(reshape875) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias3) + gv1526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc900: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1526, R.dtype("float16")) + cls.add(alloc892, alloc899, alloc900) + R.vm.kill_object(alloc892) + R.vm.kill_object(alloc899) + model_decoder_layers_16_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[889] + model_decoder_layers_16_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[890] + gv1527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc901: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage15, R.prim_value(0), gv1527, R.dtype("float16")) + cls.layer_norm(alloc900, model_decoder_layers_16_encoder_attn_layer_norm_weight3, model_decoder_layers_16_encoder_attn_layer_norm_bias3, alloc901) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias3) + model_decoder_layers_16_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] + model_decoder_layers_16_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[886] + gv1528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1528, R.dtype("float16")) + _900: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight3, alloc901, model_decoder_layers_16_encoder_attn_q_proj_bias3, alloc902) + R.vm.kill_object(alloc901) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias3) + gv1529: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape876: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc902, gv1529, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc902) + gv1530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape877: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape876, gv1530, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape876) + gv1531: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1531, R.dtype("float16")) + _901: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape877, alloc903) + R.vm.kill_object(reshape877) + gv1532: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape878: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc903, gv1532, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc903) + gv1533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape879: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape878, gv1533, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape878) + model_decoder_layers_16_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] + model_decoder_layers_16_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[888] + gv1534: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc904: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1534, R.dtype("float16")) + _902: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight3, reshape879, model_decoder_layers_16_encoder_attn_out_proj_bias3, alloc904) + R.vm.kill_object(reshape879) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias3) + gv1535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1535, R.dtype("float16")) + cls.add(alloc900, alloc904, alloc905) + R.vm.kill_object(alloc900) + R.vm.kill_object(alloc904) + model_decoder_layers_16_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[895] + model_decoder_layers_16_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[896] + gv1536: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1536, R.dtype("float16")) + cls.layer_norm(alloc905, model_decoder_layers_16_final_layer_norm_weight3, model_decoder_layers_16_final_layer_norm_bias3, alloc906) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight3) + 
R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias3) + model_decoder_layers_16_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] + model_decoder_layers_16_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[892] + gv1537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1537, R.dtype("float16")) + _905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_16_fc1_weight3, alloc906, model_decoder_layers_16_fc1_bias3, alloc907) + R.vm.kill_object(alloc906) + R.vm.kill_object(model_decoder_layers_16_fc1_weight3) + R.vm.kill_object(model_decoder_layers_16_fc1_bias3) + model_decoder_layers_16_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] + model_decoder_layers_16_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[894] + gv1538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1538, R.dtype("float16")) + _906: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_16_fc2_weight3, alloc907, model_decoder_layers_16_fc2_bias3, alloc908) + R.vm.kill_object(alloc907) + R.vm.kill_object(model_decoder_layers_16_fc2_weight3) + R.vm.kill_object(model_decoder_layers_16_fc2_bias3) + gv1539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1539, R.dtype("float16")) + cls.add(alloc905, alloc908, alloc909) + R.vm.kill_object(alloc905) + R.vm.kill_object(alloc908) + model_decoder_layers_17_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[904] + model_decoder_layers_17_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[905] + gv1540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1540, R.dtype("float16")) + cls.layer_norm(alloc909, model_decoder_layers_17_self_attn_layer_norm_weight3, model_decoder_layers_17_self_attn_layer_norm_bias3, alloc910) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias3) + model_decoder_layers_17_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] + model_decoder_layers_17_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[901] + gv1541: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1541, R.dtype("float16")) + _909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_q_proj_weight3, alloc910, model_decoder_layers_17_self_attn_q_proj_bias3, alloc911) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight3) + 
R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias3) + gv1542: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape880: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc911, gv1542, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc911) + model_decoder_layers_17_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] + gv1543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1543, R.dtype("float16")) + _910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_17_self_attn_k_proj_weight3, alloc910, alloc912) + R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight3) + gv1544: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape881: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc912, gv1544, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc912) + model_decoder_layers_17_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] + model_decoder_layers_17_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[899] + gv1545: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1545, R.dtype("float16")) + _911: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_v_proj_weight3, alloc910, model_decoder_layers_17_self_attn_v_proj_bias3, alloc913) + R.vm.kill_object(alloc910) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias3) + gv1546: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape882: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc913, gv1546, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc913) + gv1547: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc914: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1547, R.dtype("float16")) + cls.concatenate(reshape880, reshape881, reshape882, alloc914) + R.vm.kill_object(reshape880) + R.vm.kill_object(reshape881) + R.vm.kill_object(reshape882) + gv1548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape883: R.Tensor((batch_size, 60, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc914, gv1548, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc914) + gv1549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1549, R.dtype("float16")) + _913: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape883, alloc915) + R.vm.kill_object(reshape883) + gv1550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape884: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc915, gv1550, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc915) + gv1551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape885: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape884, gv1551, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape884) + model_decoder_layers_17_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] + model_decoder_layers_17_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[903] + gv1552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1552, R.dtype("float16")) + _914: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_self_attn_out_proj_weight3, reshape885, model_decoder_layers_17_self_attn_out_proj_bias3, alloc916) + R.vm.kill_object(reshape885) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias3) + gv1553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1553, R.dtype("float16")) + cls.add(alloc909, alloc916, alloc917) + R.vm.kill_object(alloc909) + R.vm.kill_object(alloc916) + model_decoder_layers_17_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[913] + model_decoder_layers_17_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[914] + gv1554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1554, R.dtype("float16")) + cls.layer_norm(alloc917, model_decoder_layers_17_encoder_attn_layer_norm_weight3, model_decoder_layers_17_encoder_attn_layer_norm_bias3, alloc918) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias3) + 
model_decoder_layers_17_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] + model_decoder_layers_17_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[910] + gv1555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1555, R.dtype("float16")) + _917: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight3, alloc918, model_decoder_layers_17_encoder_attn_q_proj_bias3, alloc919) + R.vm.kill_object(alloc918) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias3) + gv1556: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape886: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc919, gv1556, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc919) + gv1557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape887: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape886, gv1557, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape886) + gv1558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1558, R.dtype("float16")) + _918: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape887, alloc920) + R.vm.kill_object(reshape887) + gv1559: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape888: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc920, gv1559, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc920) + gv1560: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape889: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape888, gv1560, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape888) + model_decoder_layers_17_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] + model_decoder_layers_17_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[912] + gv1561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1561, R.dtype("float16")) + _919: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight3, reshape889, model_decoder_layers_17_encoder_attn_out_proj_bias3, alloc921) + R.vm.kill_object(reshape889) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias3) + gv1562: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1562, R.dtype("float16")) + cls.add(alloc917, alloc921, alloc922) + R.vm.kill_object(alloc917) + R.vm.kill_object(alloc921) + model_decoder_layers_17_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[919] + model_decoder_layers_17_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[920] + gv1563: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1563, R.dtype("float16")) + cls.layer_norm(alloc922, model_decoder_layers_17_final_layer_norm_weight3, model_decoder_layers_17_final_layer_norm_bias3, alloc923) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias3) + model_decoder_layers_17_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] + model_decoder_layers_17_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[916] + gv1564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1564, R.dtype("float16")) + _922: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_17_fc1_weight3, alloc923, model_decoder_layers_17_fc1_bias3, alloc924) + R.vm.kill_object(alloc923) + R.vm.kill_object(model_decoder_layers_17_fc1_weight3) + R.vm.kill_object(model_decoder_layers_17_fc1_bias3) + model_decoder_layers_17_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] + model_decoder_layers_17_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[918] + gv1565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1565, R.dtype("float16")) + _923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_17_fc2_weight3, alloc924, model_decoder_layers_17_fc2_bias3, alloc925) + R.vm.kill_object(alloc924) + R.vm.kill_object(model_decoder_layers_17_fc2_weight3) + R.vm.kill_object(model_decoder_layers_17_fc2_bias3) + gv1566: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1566, R.dtype("float16")) + cls.add(alloc922, alloc925, alloc926) + R.vm.kill_object(alloc922) + R.vm.kill_object(alloc925) + model_decoder_layers_18_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[928] + 
model_decoder_layers_18_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[929] + gv1567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1567, R.dtype("float16")) + cls.layer_norm(alloc926, model_decoder_layers_18_self_attn_layer_norm_weight3, model_decoder_layers_18_self_attn_layer_norm_bias3, alloc927) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias3) + model_decoder_layers_18_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] + model_decoder_layers_18_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[925] + gv1568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1568, R.dtype("float16")) + _926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_q_proj_weight3, alloc927, model_decoder_layers_18_self_attn_q_proj_bias3, alloc928) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias3) + gv1569: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape890: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc928, gv1569, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc928) + model_decoder_layers_18_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] + gv1570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1570, R.dtype("float16")) + _927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_18_self_attn_k_proj_weight3, alloc927, alloc929) + R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight3) + gv1571: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape891: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc929, gv1571, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc929) + model_decoder_layers_18_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] + model_decoder_layers_18_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[923] + gv1572: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1572, R.dtype("float16")) + _928: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_18_self_attn_v_proj_weight3, alloc927, model_decoder_layers_18_self_attn_v_proj_bias3, alloc930) + R.vm.kill_object(alloc927) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias3) + gv1573: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape892: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc930, gv1573, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc930) + gv1574: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc931: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1574, R.dtype("float16")) + cls.concatenate(reshape890, reshape891, reshape892, alloc931) + R.vm.kill_object(reshape890) + R.vm.kill_object(reshape891) + R.vm.kill_object(reshape892) + gv1575: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape893: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc931, gv1575, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc931) + gv1576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc932: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1576, R.dtype("float16")) + _930: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape893, alloc932) + R.vm.kill_object(reshape893) + gv1577: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape894: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc932, gv1577, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc932) + gv1578: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape895: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape894, gv1578, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape894) + model_decoder_layers_18_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] + model_decoder_layers_18_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[927] + gv1579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc933: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1579, R.dtype("float16")) + _931: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_self_attn_out_proj_weight3, reshape895, 
model_decoder_layers_18_self_attn_out_proj_bias3, alloc933) + R.vm.kill_object(reshape895) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias3) + gv1580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1580, R.dtype("float16")) + cls.add(alloc926, alloc933, alloc934) + R.vm.kill_object(alloc926) + R.vm.kill_object(alloc933) + model_decoder_layers_18_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[937] + model_decoder_layers_18_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[938] + gv1581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1581, R.dtype("float16")) + cls.layer_norm(alloc934, model_decoder_layers_18_encoder_attn_layer_norm_weight3, model_decoder_layers_18_encoder_attn_layer_norm_bias3, alloc935) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias3) + model_decoder_layers_18_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] + model_decoder_layers_18_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[934] + gv1582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1582, R.dtype("float16")) + _934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight3, alloc935, model_decoder_layers_18_encoder_attn_q_proj_bias3, alloc936) + R.vm.kill_object(alloc935) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias3) + gv1583: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape896: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc936, gv1583, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc936) + gv1584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape897: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape896, gv1584, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape896) + gv1585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1585, R.dtype("float16")) + _935: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape897, alloc937) + 
R.vm.kill_object(reshape897) + gv1586: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape898: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc937, gv1586, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc937) + gv1587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape899: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape898, gv1587, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape898) + model_decoder_layers_18_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] + model_decoder_layers_18_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[936] + gv1588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc938: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1588, R.dtype("float16")) + _936: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight3, reshape899, model_decoder_layers_18_encoder_attn_out_proj_bias3, alloc938) + R.vm.kill_object(reshape899) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias3) + gv1589: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1589, R.dtype("float16")) + cls.add(alloc934, alloc938, alloc939) + R.vm.kill_object(alloc934) + R.vm.kill_object(alloc938) + model_decoder_layers_18_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[943] + model_decoder_layers_18_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[944] + gv1590: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1590, R.dtype("float16")) + cls.layer_norm(alloc939, model_decoder_layers_18_final_layer_norm_weight3, model_decoder_layers_18_final_layer_norm_bias3, alloc940) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias3) + model_decoder_layers_18_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] + model_decoder_layers_18_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[940] + gv1591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1591, R.dtype("float16")) + _939: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_18_fc1_weight3, alloc940, model_decoder_layers_18_fc1_bias3, 
alloc941) + R.vm.kill_object(alloc940) + R.vm.kill_object(model_decoder_layers_18_fc1_weight3) + R.vm.kill_object(model_decoder_layers_18_fc1_bias3) + model_decoder_layers_18_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] + model_decoder_layers_18_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[942] + gv1592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc942: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1592, R.dtype("float16")) + _940: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_18_fc2_weight3, alloc941, model_decoder_layers_18_fc2_bias3, alloc942) + R.vm.kill_object(alloc941) + R.vm.kill_object(model_decoder_layers_18_fc2_weight3) + R.vm.kill_object(model_decoder_layers_18_fc2_bias3) + gv1593: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1593, R.dtype("float16")) + cls.add(alloc939, alloc942, alloc943) + R.vm.kill_object(alloc939) + R.vm.kill_object(alloc942) + model_decoder_layers_19_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[952] + model_decoder_layers_19_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[953] + gv1594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc944: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage14, R.prim_value(0), gv1594, R.dtype("float16")) + cls.layer_norm(alloc943, model_decoder_layers_19_self_attn_layer_norm_weight3, model_decoder_layers_19_self_attn_layer_norm_bias3, alloc944) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias3) + model_decoder_layers_19_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] + model_decoder_layers_19_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[949] + gv1595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1595, R.dtype("float16")) + _943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_q_proj_weight3, alloc944, model_decoder_layers_19_self_attn_q_proj_bias3, alloc945) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias3) + gv1596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape900: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc945, gv1596, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc945) + model_decoder_layers_19_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] + gv1597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc946: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1597, R.dtype("float16")) + _944: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_19_self_attn_k_proj_weight3, alloc944, alloc946) + R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight3) + gv1598: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape901: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc946, gv1598, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc946) + model_decoder_layers_19_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] + model_decoder_layers_19_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[947] + gv1599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1599, R.dtype("float16")) + _945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_v_proj_weight3, alloc944, model_decoder_layers_19_self_attn_v_proj_bias3, alloc947) + R.vm.kill_object(alloc944) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias3) + gv1600: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape902: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc947, gv1600, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc947) + gv1601: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc948: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1601, R.dtype("float16")) + cls.concatenate(reshape900, reshape901, reshape902, alloc948) + R.vm.kill_object(reshape900) + R.vm.kill_object(reshape901) + R.vm.kill_object(reshape902) + gv1602: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape903: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc948, gv1602, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc948) + gv1603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1603, R.dtype("float16")) + _947: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape903, alloc949) + R.vm.kill_object(reshape903) + gv1604: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape904: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc949, gv1604, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc949) + gv1605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape905: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape904, gv1605, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape904) + model_decoder_layers_19_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] + model_decoder_layers_19_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[951] + gv1606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc950: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1606, R.dtype("float16")) + _948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_self_attn_out_proj_weight3, reshape905, model_decoder_layers_19_self_attn_out_proj_bias3, alloc950) + R.vm.kill_object(reshape905) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias3) + gv1607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc951: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1607, R.dtype("float16")) + cls.add(alloc943, alloc950, alloc951) + R.vm.kill_object(alloc943) + R.vm.kill_object(alloc950) + model_decoder_layers_19_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[961] + model_decoder_layers_19_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[962] + gv1608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1608, R.dtype("float16")) + cls.layer_norm(alloc951, model_decoder_layers_19_encoder_attn_layer_norm_weight3, model_decoder_layers_19_encoder_attn_layer_norm_bias3, alloc952) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias3) + model_decoder_layers_19_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] + model_decoder_layers_19_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[958] + gv1609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1609, R.dtype("float16")) + _951: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight3, alloc952, model_decoder_layers_19_encoder_attn_q_proj_bias3, alloc953) + R.vm.kill_object(alloc952) + 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias3) + gv1610: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape906: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc953, gv1610, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc953) + gv1611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape907: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape906, gv1611, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape906) + gv1612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1612, R.dtype("float16")) + _952: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape907, alloc954) + R.vm.kill_object(reshape907) + gv1613: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape908: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc954, 
gv1613, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc954) + gv1614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape909: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape908, gv1614, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape908) + model_decoder_layers_19_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] + model_decoder_layers_19_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[960] + gv1615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1615, R.dtype("float16")) + _953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight3, reshape909, model_decoder_layers_19_encoder_attn_out_proj_bias3, alloc955) + R.vm.kill_object(reshape909) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias3) + gv1616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1616, R.dtype("float16")) + cls.add(alloc951, alloc955, alloc956) + R.vm.kill_object(alloc951) + R.vm.kill_object(alloc955) + 
model_decoder_layers_19_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[967] + model_decoder_layers_19_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[968] + gv1617: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1617, R.dtype("float16")) + cls.layer_norm(alloc956, model_decoder_layers_19_final_layer_norm_weight3, model_decoder_layers_19_final_layer_norm_bias3, alloc957) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias3) + model_decoder_layers_19_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] + model_decoder_layers_19_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[964] + gv1618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1618, R.dtype("float16")) + _956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_19_fc1_weight3, alloc957, model_decoder_layers_19_fc1_bias3, alloc958) + R.vm.kill_object(alloc957) + R.vm.kill_object(model_decoder_layers_19_fc1_weight3) + R.vm.kill_object(model_decoder_layers_19_fc1_bias3) + model_decoder_layers_19_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] + model_decoder_layers_19_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[966] + gv1619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1619, R.dtype("float16")) + _957: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_19_fc2_weight3, alloc958, model_decoder_layers_19_fc2_bias3, alloc959) + R.vm.kill_object(alloc958) + R.vm.kill_object(model_decoder_layers_19_fc2_weight3) + R.vm.kill_object(model_decoder_layers_19_fc2_bias3) + gv1620: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1620, R.dtype("float16")) + cls.add(alloc956, alloc959, alloc960) + R.vm.kill_object(alloc956) + R.vm.kill_object(alloc959) + model_decoder_layers_20_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[976] + model_decoder_layers_20_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[977] + gv1621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1621, R.dtype("float16")) + cls.layer_norm(alloc960, model_decoder_layers_20_self_attn_layer_norm_weight3, model_decoder_layers_20_self_attn_layer_norm_bias3, alloc961) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias3) + model_decoder_layers_20_self_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[972] + model_decoder_layers_20_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[973] + gv1622: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1622, R.dtype("float16")) + _960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_q_proj_weight3, alloc961, model_decoder_layers_20_self_attn_q_proj_bias3, alloc962) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias3) + gv1623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape910: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc962, gv1623, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc962) + model_decoder_layers_20_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] + gv1624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1624, R.dtype("float16")) + _961: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_20_self_attn_k_proj_weight3, alloc961, alloc963) + 
R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight3) + gv1625: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape911: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc963, gv1625, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc963) + model_decoder_layers_20_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] + model_decoder_layers_20_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[971] + gv1626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1626, R.dtype("float16")) + _962: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_v_proj_weight3, alloc961, model_decoder_layers_20_self_attn_v_proj_bias3, alloc964) + R.vm.kill_object(alloc961) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias3) + gv1627: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape912: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc964, gv1627, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc964) + gv1628: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc965: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1628, R.dtype("float16")) + cls.concatenate(reshape910, reshape911, reshape912, alloc965) + R.vm.kill_object(reshape910) + R.vm.kill_object(reshape911) + R.vm.kill_object(reshape912) + gv1629: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape913: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc965, gv1629, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc965) + gv1630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1630, R.dtype("float16")) + _964: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape913, alloc966) + R.vm.kill_object(reshape913) + gv1631: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape914: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc966, gv1631, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc966) + gv1632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape915: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape914, gv1632, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape914) + model_decoder_layers_20_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] + model_decoder_layers_20_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[975] + gv1633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1633, R.dtype("float16")) + _965: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_self_attn_out_proj_weight3, reshape915, model_decoder_layers_20_self_attn_out_proj_bias3, alloc967) + R.vm.kill_object(reshape915) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias3) + gv1634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1634, R.dtype("float16")) + cls.add(alloc960, alloc967, alloc968) + R.vm.kill_object(alloc960) + R.vm.kill_object(alloc967) + model_decoder_layers_20_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = 
packed_params[985] + model_decoder_layers_20_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[986] + gv1635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1635, R.dtype("float16")) + cls.layer_norm(alloc968, model_decoder_layers_20_encoder_attn_layer_norm_weight3, model_decoder_layers_20_encoder_attn_layer_norm_bias3, alloc969) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias3) + model_decoder_layers_20_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] + model_decoder_layers_20_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[982] + gv1636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1636, R.dtype("float16")) + _968: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight3, alloc969, model_decoder_layers_20_encoder_attn_q_proj_bias3, alloc970) + R.vm.kill_object(alloc969) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias3) + gv1637: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape916: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc970, gv1637, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc970) + gv1638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape917: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape916, gv1638, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape916) + gv1639: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1639, R.dtype("float16")) + _969: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape917, alloc971) + R.vm.kill_object(reshape917) + gv1640: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape918: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc971, gv1640, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc971) + gv1641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape919: R.Tensor((batch_size, 1, 
1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape918, gv1641, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape918) + model_decoder_layers_20_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] + model_decoder_layers_20_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[984] + gv1642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1642, R.dtype("float16")) + _970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight3, reshape919, model_decoder_layers_20_encoder_attn_out_proj_bias3, alloc972) + R.vm.kill_object(reshape919) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias3) + gv1643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1643, R.dtype("float16")) + cls.add(alloc968, alloc972, alloc973) + R.vm.kill_object(alloc968) + R.vm.kill_object(alloc972) + model_decoder_layers_20_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[991] + model_decoder_layers_20_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[992] + gv1644: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1644, R.dtype("float16")) + cls.layer_norm(alloc973, model_decoder_layers_20_final_layer_norm_weight3, model_decoder_layers_20_final_layer_norm_bias3, alloc974) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias3) + model_decoder_layers_20_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] + model_decoder_layers_20_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[988] + gv1645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1645, R.dtype("float16")) + _973: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_20_fc1_weight3, alloc974, model_decoder_layers_20_fc1_bias3, alloc975) + R.vm.kill_object(alloc974) + R.vm.kill_object(model_decoder_layers_20_fc1_weight3) + R.vm.kill_object(model_decoder_layers_20_fc1_bias3) + model_decoder_layers_20_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] + model_decoder_layers_20_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[990] + gv1646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc976: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1646, R.dtype("float16")) + _974: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", 
model_decoder_layers_20_fc2_weight3, alloc975, model_decoder_layers_20_fc2_bias3, alloc976) + R.vm.kill_object(alloc975) + R.vm.kill_object(model_decoder_layers_20_fc2_weight3) + R.vm.kill_object(model_decoder_layers_20_fc2_bias3) + gv1647: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc977: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1647, R.dtype("float16")) + cls.add(alloc973, alloc976, alloc977) + R.vm.kill_object(alloc973) + R.vm.kill_object(alloc976) + model_decoder_layers_21_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1000] + model_decoder_layers_21_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1001] + gv1648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc978: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1648, R.dtype("float16")) + cls.layer_norm(alloc977, model_decoder_layers_21_self_attn_layer_norm_weight3, model_decoder_layers_21_self_attn_layer_norm_bias3, alloc978) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias3) + model_decoder_layers_21_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] + model_decoder_layers_21_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[997] + gv1649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) 
+ alloc979: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1649, R.dtype("float16")) + _977: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_q_proj_weight3, alloc978, model_decoder_layers_21_self_attn_q_proj_bias3, alloc979) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias3) + gv1650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape920: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc979, gv1650, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc979) + model_decoder_layers_21_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] + gv1651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc980: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1651, R.dtype("float16")) + _978: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_21_self_attn_k_proj_weight3, alloc978, alloc980) + R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight3) + gv1652: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape921: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc980, gv1652, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc980) + model_decoder_layers_21_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] + model_decoder_layers_21_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[995] + gv1653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc981: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1653, R.dtype("float16")) + _979: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_v_proj_weight3, alloc978, model_decoder_layers_21_self_attn_v_proj_bias3, alloc981) + R.vm.kill_object(alloc978) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias3) + gv1654: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape922: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc981, gv1654, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc981) + gv1655: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc982: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1655, R.dtype("float16")) + cls.concatenate(reshape920, 
reshape921, reshape922, alloc982) + R.vm.kill_object(reshape920) + R.vm.kill_object(reshape921) + R.vm.kill_object(reshape922) + gv1656: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape923: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc982, gv1656, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc982) + gv1657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc983: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1657, R.dtype("float16")) + _981: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape923, alloc983) + R.vm.kill_object(reshape923) + gv1658: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape924: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc983, gv1658, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc983) + gv1659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape925: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape924, gv1659, sinfo_args=(R.Tensor((batch_size, 1, 
1280), dtype="float16"),)) + R.vm.kill_object(reshape924) + model_decoder_layers_21_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] + model_decoder_layers_21_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[999] + gv1660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc984: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1660, R.dtype("float16")) + _982: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_self_attn_out_proj_weight3, reshape925, model_decoder_layers_21_self_attn_out_proj_bias3, alloc984) + R.vm.kill_object(reshape925) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias3) + gv1661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1661, R.dtype("float16")) + cls.add(alloc977, alloc984, alloc985) + R.vm.kill_object(alloc977) + R.vm.kill_object(alloc984) + model_decoder_layers_21_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1009] + model_decoder_layers_21_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1010] + gv1662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc986: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage16, R.prim_value(0), gv1662, R.dtype("float16")) + cls.layer_norm(alloc985, model_decoder_layers_21_encoder_attn_layer_norm_weight3, model_decoder_layers_21_encoder_attn_layer_norm_bias3, alloc986) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias3) + model_decoder_layers_21_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] + model_decoder_layers_21_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1006] + gv1663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1663, R.dtype("float16")) + _985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight3, alloc986, model_decoder_layers_21_encoder_attn_q_proj_bias3, alloc987) + R.vm.kill_object(alloc986) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias3) + gv1664: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape926: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc987, gv1664, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc987) + gv1665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape927: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape926, gv1665, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape926) + gv1666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1666, R.dtype("float16")) + _986: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape927, alloc988) + R.vm.kill_object(reshape927) + gv1667: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape928: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc988, gv1667, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc988) + gv1668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape929: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape928, gv1668, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape928) + model_decoder_layers_21_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] + model_decoder_layers_21_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1008] + gv1669: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc989: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1669, R.dtype("float16")) + _987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight3, reshape929, model_decoder_layers_21_encoder_attn_out_proj_bias3, alloc989) + R.vm.kill_object(reshape929) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias3) + gv1670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1670, R.dtype("float16")) + cls.add(alloc985, alloc989, alloc990) + R.vm.kill_object(alloc985) + R.vm.kill_object(alloc989) + model_decoder_layers_21_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1015] + model_decoder_layers_21_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1016] + gv1671: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc991: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1671, R.dtype("float16")) + cls.layer_norm(alloc990, model_decoder_layers_21_final_layer_norm_weight3, model_decoder_layers_21_final_layer_norm_bias3, alloc991) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight3) + 
R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias3) + model_decoder_layers_21_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] + model_decoder_layers_21_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1012] + gv1672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1672, R.dtype("float16")) + _990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_21_fc1_weight3, alloc991, model_decoder_layers_21_fc1_bias3, alloc992) + R.vm.kill_object(alloc991) + R.vm.kill_object(model_decoder_layers_21_fc1_weight3) + R.vm.kill_object(model_decoder_layers_21_fc1_bias3) + model_decoder_layers_21_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] + model_decoder_layers_21_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1014] + gv1673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1673, R.dtype("float16")) + _991: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_21_fc2_weight3, alloc992, model_decoder_layers_21_fc2_bias3, alloc993) + R.vm.kill_object(alloc992) + R.vm.kill_object(model_decoder_layers_21_fc2_weight3) + R.vm.kill_object(model_decoder_layers_21_fc2_bias3) + gv1674: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1674, R.dtype("float16")) + cls.add(alloc990, alloc993, alloc994) + R.vm.kill_object(alloc990) + R.vm.kill_object(alloc993) + model_decoder_layers_22_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1024] + model_decoder_layers_22_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1025] + gv1675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc995: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1675, R.dtype("float16")) + cls.layer_norm(alloc994, model_decoder_layers_22_self_attn_layer_norm_weight3, model_decoder_layers_22_self_attn_layer_norm_bias3, alloc995) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias3) + model_decoder_layers_22_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] + model_decoder_layers_22_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1021] + gv1676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1676, R.dtype("float16")) + _994: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_q_proj_weight3, alloc995, model_decoder_layers_22_self_attn_q_proj_bias3, alloc996) + 
R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias3) + gv1677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape930: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc996, gv1677, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc996) + model_decoder_layers_22_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] + gv1678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1678, R.dtype("float16")) + _995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_22_self_attn_k_proj_weight3, alloc995, alloc997) + R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight3) + gv1679: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape931: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc997, gv1679, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc997) + model_decoder_layers_22_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] + model_decoder_layers_22_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = 
packed_params[1019] + gv1680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1680, R.dtype("float16")) + _996: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_v_proj_weight3, alloc995, model_decoder_layers_22_self_attn_v_proj_bias3, alloc998) + R.vm.kill_object(alloc995) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias3) + gv1681: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape932: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc998, gv1681, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc998) + gv1682: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc999: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1682, R.dtype("float16")) + cls.concatenate(reshape930, reshape931, reshape932, alloc999) + R.vm.kill_object(reshape930) + R.vm.kill_object(reshape931) + R.vm.kill_object(reshape932) + gv1683: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape933: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc999, gv1683, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc999) + gv1684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1684, R.dtype("float16")) + _998: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape933, alloc1000) + R.vm.kill_object(reshape933) + gv1685: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape934: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1000, gv1685, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1000) + gv1686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape935: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape934, gv1686, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape934) + model_decoder_layers_22_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] + model_decoder_layers_22_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1023] + gv1687: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1687, R.dtype("float16")) + _999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_self_attn_out_proj_weight3, reshape935, model_decoder_layers_22_self_attn_out_proj_bias3, alloc1001) + R.vm.kill_object(reshape935) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias3) + gv1688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1002: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1688, R.dtype("float16")) + cls.add(alloc994, alloc1001, alloc1002) + R.vm.kill_object(alloc994) + R.vm.kill_object(alloc1001) + model_decoder_layers_22_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1033] + model_decoder_layers_22_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1034] + gv1689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1689, R.dtype("float16")) + cls.layer_norm(alloc1002, model_decoder_layers_22_encoder_attn_layer_norm_weight3, model_decoder_layers_22_encoder_attn_layer_norm_bias3, alloc1003) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight3) + 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias3) + model_decoder_layers_22_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] + model_decoder_layers_22_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1030] + gv1690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1690, R.dtype("float16")) + _1002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight3, alloc1003, model_decoder_layers_22_encoder_attn_q_proj_bias3, alloc1004) + R.vm.kill_object(alloc1003) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias3) + gv1691: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape936: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1004, gv1691, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1004) + gv1692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape937: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape936, gv1692, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape936) + gv1693: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1693, R.dtype("float16")) + _1003: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape937, alloc1005) + R.vm.kill_object(reshape937) + gv1694: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape938: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1005, gv1694, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1005) + gv1695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape939: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape938, gv1695, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape938) + model_decoder_layers_22_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] + model_decoder_layers_22_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1032] + gv1696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1006: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, 
R.prim_value(0), gv1696, R.dtype("float16")) + _1004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight3, reshape939, model_decoder_layers_22_encoder_attn_out_proj_bias3, alloc1006) + R.vm.kill_object(reshape939) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias3) + gv1697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1697, R.dtype("float16")) + cls.add(alloc1002, alloc1006, alloc1007) + R.vm.kill_object(alloc1002) + R.vm.kill_object(alloc1006) + model_decoder_layers_22_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1039] + model_decoder_layers_22_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1040] + gv1698: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1698, R.dtype("float16")) + cls.layer_norm(alloc1007, model_decoder_layers_22_final_layer_norm_weight3, model_decoder_layers_22_final_layer_norm_bias3, alloc1008) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias3) + model_decoder_layers_22_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] + model_decoder_layers_22_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1036] + gv1699: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1009: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1699, R.dtype("float16")) + _1007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_22_fc1_weight3, alloc1008, model_decoder_layers_22_fc1_bias3, alloc1009) + R.vm.kill_object(alloc1008) + R.vm.kill_object(model_decoder_layers_22_fc1_weight3) + R.vm.kill_object(model_decoder_layers_22_fc1_bias3) + model_decoder_layers_22_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] + model_decoder_layers_22_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1038] + gv1700: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1700, R.dtype("float16")) + _1008: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_22_fc2_weight3, alloc1009, model_decoder_layers_22_fc2_bias3, alloc1010) + R.vm.kill_object(alloc1009) + R.vm.kill_object(model_decoder_layers_22_fc2_weight3) + R.vm.kill_object(model_decoder_layers_22_fc2_bias3) + gv1701: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1701, R.dtype("float16")) + cls.add(alloc1007, alloc1010, alloc1011) + R.vm.kill_object(alloc1007) + 
R.vm.kill_object(alloc1010) + model_decoder_layers_23_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1048] + model_decoder_layers_23_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1049] + gv1702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1702, R.dtype("float16")) + cls.layer_norm(alloc1011, model_decoder_layers_23_self_attn_layer_norm_weight3, model_decoder_layers_23_self_attn_layer_norm_bias3, alloc1012) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias3) + model_decoder_layers_23_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] + model_decoder_layers_23_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1045] + gv1703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1703, R.dtype("float16")) + _1011: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_q_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_q_proj_bias3, alloc1013) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias3) + gv1704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape940: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1013, gv1704, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1013) + model_decoder_layers_23_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] + gv1705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1705, R.dtype("float16")) + _1012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_23_self_attn_k_proj_weight3, alloc1012, alloc1014) + R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight3) + gv1706: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape941: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1014, gv1706, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1014) + model_decoder_layers_23_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] + model_decoder_layers_23_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1043] + gv1707: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1015: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage13, R.prim_value(0), gv1707, R.dtype("float16")) + _1013: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_v_proj_weight3, alloc1012, model_decoder_layers_23_self_attn_v_proj_bias3, alloc1015) + R.vm.kill_object(alloc1012) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias3) + gv1708: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape942: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1015, gv1708, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1015) + gv1709: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1016: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1709, R.dtype("float16")) + cls.concatenate(reshape940, reshape941, reshape942, alloc1016) + R.vm.kill_object(reshape940) + R.vm.kill_object(reshape941) + R.vm.kill_object(reshape942) + gv1710: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape943: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1016, gv1710, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1016) + gv1711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1711, R.dtype("float16")) + _1015: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape943, alloc1017) + R.vm.kill_object(reshape943) + gv1712: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape944: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1017, gv1712, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1017) + gv1713: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape945: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape944, gv1713, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape944) + model_decoder_layers_23_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] + model_decoder_layers_23_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1047] + gv1714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1714, 
R.dtype("float16")) + _1016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_self_attn_out_proj_weight3, reshape945, model_decoder_layers_23_self_attn_out_proj_bias3, alloc1018) + R.vm.kill_object(reshape945) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias3) + gv1715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1715, R.dtype("float16")) + cls.add(alloc1011, alloc1018, alloc1019) + R.vm.kill_object(alloc1011) + R.vm.kill_object(alloc1018) + model_decoder_layers_23_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1057] + model_decoder_layers_23_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1058] + gv1716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1716, R.dtype("float16")) + cls.layer_norm(alloc1019, model_decoder_layers_23_encoder_attn_layer_norm_weight3, model_decoder_layers_23_encoder_attn_layer_norm_bias3, alloc1020) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias3) + model_decoder_layers_23_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] + model_decoder_layers_23_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1054] + gv1717: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1717, R.dtype("float16")) + _1019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight3, alloc1020, model_decoder_layers_23_encoder_attn_q_proj_bias3, alloc1021) + R.vm.kill_object(alloc1020) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias3) + gv1718: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape946: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1021, gv1718, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1021) + gv1719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape947: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape946, gv1719, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape946) + gv1720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1720, 
R.dtype("float16")) + _1020: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape947, alloc1022) + R.vm.kill_object(reshape947) + gv1721: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape948: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1022, gv1721, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1022) + gv1722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape949: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape948, gv1722, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape948) + model_decoder_layers_23_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] + model_decoder_layers_23_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1056] + gv1723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1023: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1723, R.dtype("float16")) + _1021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight3, reshape949, model_decoder_layers_23_encoder_attn_out_proj_bias3, alloc1023) + R.vm.kill_object(reshape949) + 
R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias3) + gv1724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1724, R.dtype("float16")) + cls.add(alloc1019, alloc1023, alloc1024) + R.vm.kill_object(alloc1019) + R.vm.kill_object(alloc1023) + model_decoder_layers_23_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1063] + model_decoder_layers_23_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1064] + gv1725: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1725, R.dtype("float16")) + cls.layer_norm(alloc1024, model_decoder_layers_23_final_layer_norm_weight3, model_decoder_layers_23_final_layer_norm_bias3, alloc1025) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias3) + model_decoder_layers_23_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] + model_decoder_layers_23_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1060] + gv1726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1726, R.dtype("float16")) + 
_1024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_23_fc1_weight3, alloc1025, model_decoder_layers_23_fc1_bias3, alloc1026) + R.vm.kill_object(alloc1025) + R.vm.kill_object(model_decoder_layers_23_fc1_weight3) + R.vm.kill_object(model_decoder_layers_23_fc1_bias3) + model_decoder_layers_23_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] + model_decoder_layers_23_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1062] + gv1727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1727, R.dtype("float16")) + _1025: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_23_fc2_weight3, alloc1026, model_decoder_layers_23_fc2_bias3, alloc1027) + R.vm.kill_object(alloc1026) + R.vm.kill_object(model_decoder_layers_23_fc2_weight3) + R.vm.kill_object(model_decoder_layers_23_fc2_bias3) + gv1728: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1728, R.dtype("float16")) + cls.add(alloc1024, alloc1027, alloc1028) + R.vm.kill_object(alloc1024) + R.vm.kill_object(alloc1027) + model_decoder_layers_24_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1072] + model_decoder_layers_24_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1073] + gv1729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1729, R.dtype("float16")) + cls.layer_norm(alloc1028, model_decoder_layers_24_self_attn_layer_norm_weight3, model_decoder_layers_24_self_attn_layer_norm_bias3, alloc1029) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias3) + model_decoder_layers_24_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] + model_decoder_layers_24_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1069] + gv1730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1030: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1730, R.dtype("float16")) + _1028: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_q_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_q_proj_bias3, alloc1030) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias3) + gv1731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape950: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1030, gv1731, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1030) + 
model_decoder_layers_24_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] + gv1732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1732, R.dtype("float16")) + _1029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_24_self_attn_k_proj_weight3, alloc1029, alloc1031) + R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight3) + gv1733: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape951: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1031, gv1733, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1031) + model_decoder_layers_24_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] + model_decoder_layers_24_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1067] + gv1734: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1734, R.dtype("float16")) + _1030: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_v_proj_weight3, alloc1029, model_decoder_layers_24_self_attn_v_proj_bias3, alloc1032) + R.vm.kill_object(alloc1029) + 
R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias3) + gv1735: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape952: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1032, gv1735, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1032) + gv1736: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1033: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1736, R.dtype("float16")) + cls.concatenate(reshape950, reshape951, reshape952, alloc1033) + R.vm.kill_object(reshape950) + R.vm.kill_object(reshape951) + R.vm.kill_object(reshape952) + gv1737: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape953: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1033, gv1737, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1033) + gv1738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1738, R.dtype("float16")) + _1032: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape953, alloc1034) + R.vm.kill_object(reshape953) + gv1739: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape954: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1034, gv1739, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1034) + gv1740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape955: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape954, gv1740, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape954) + model_decoder_layers_24_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] + model_decoder_layers_24_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1071] + gv1741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1741, R.dtype("float16")) + _1033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_self_attn_out_proj_weight3, reshape955, model_decoder_layers_24_self_attn_out_proj_bias3, alloc1035) + R.vm.kill_object(reshape955) + 
R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias3) + gv1742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1742, R.dtype("float16")) + cls.add(alloc1028, alloc1035, alloc1036) + R.vm.kill_object(alloc1028) + R.vm.kill_object(alloc1035) + model_decoder_layers_24_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1081] + model_decoder_layers_24_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1082] + gv1743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1743, R.dtype("float16")) + cls.layer_norm(alloc1036, model_decoder_layers_24_encoder_attn_layer_norm_weight3, model_decoder_layers_24_encoder_attn_layer_norm_bias3, alloc1037) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias3) + model_decoder_layers_24_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] + model_decoder_layers_24_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1078] + gv1744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1038: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage16, R.prim_value(0), gv1744, R.dtype("float16")) + _1036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight3, alloc1037, model_decoder_layers_24_encoder_attn_q_proj_bias3, alloc1038) + R.vm.kill_object(alloc1037) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias3) + gv1745: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape956: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1038, gv1745, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1038) + gv1746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape957: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape956, gv1746, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape956) + gv1747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1747, R.dtype("float16")) + _1037: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape957, alloc1039) + R.vm.kill_object(reshape957) + gv1748: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape958: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1039, gv1748, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1039) + gv1749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape959: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape958, gv1749, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape958) + model_decoder_layers_24_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] + model_decoder_layers_24_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1080] + gv1750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1040: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1750, R.dtype("float16")) + _1038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight3, reshape959, model_decoder_layers_24_encoder_attn_out_proj_bias3, alloc1040) + R.vm.kill_object(reshape959) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias3) + gv1751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1751, R.dtype("float16")) + cls.add(alloc1036, alloc1040, alloc1041) + R.vm.kill_object(alloc1036) + R.vm.kill_object(alloc1040) + model_decoder_layers_24_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1087] + model_decoder_layers_24_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1088] + gv1752: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1752, R.dtype("float16")) + cls.layer_norm(alloc1041, model_decoder_layers_24_final_layer_norm_weight3, model_decoder_layers_24_final_layer_norm_bias3, alloc1042) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias3) + model_decoder_layers_24_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] + model_decoder_layers_24_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1084] + gv1753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1043: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1753, R.dtype("float16")) + _1041: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_24_fc1_weight3, alloc1042, model_decoder_layers_24_fc1_bias3, alloc1043) + R.vm.kill_object(alloc1042) + 
R.vm.kill_object(model_decoder_layers_24_fc1_weight3) + R.vm.kill_object(model_decoder_layers_24_fc1_bias3) + model_decoder_layers_24_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] + model_decoder_layers_24_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1086] + gv1754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1754, R.dtype("float16")) + _1042: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_24_fc2_weight3, alloc1043, model_decoder_layers_24_fc2_bias3, alloc1044) + R.vm.kill_object(alloc1043) + R.vm.kill_object(model_decoder_layers_24_fc2_weight3) + R.vm.kill_object(model_decoder_layers_24_fc2_bias3) + gv1755: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1755, R.dtype("float16")) + cls.add(alloc1041, alloc1044, alloc1045) + R.vm.kill_object(alloc1041) + R.vm.kill_object(alloc1044) + model_decoder_layers_25_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1096] + model_decoder_layers_25_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1097] + gv1756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), 
gv1756, R.dtype("float16")) + cls.layer_norm(alloc1045, model_decoder_layers_25_self_attn_layer_norm_weight3, model_decoder_layers_25_self_attn_layer_norm_bias3, alloc1046) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias3) + model_decoder_layers_25_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] + model_decoder_layers_25_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1093] + gv1757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1757, R.dtype("float16")) + _1045: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_q_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_q_proj_bias3, alloc1047) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias3) + gv1758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape960: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1047, gv1758, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1047) + model_decoder_layers_25_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] + gv1759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1759, R.dtype("float16")) + _1046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_25_self_attn_k_proj_weight3, alloc1046, alloc1048) + R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight3) + gv1760: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape961: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1048, gv1760, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1048) + model_decoder_layers_25_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] + model_decoder_layers_25_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1091] + gv1761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1761, R.dtype("float16")) + _1047: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_v_proj_weight3, alloc1046, model_decoder_layers_25_self_attn_v_proj_bias3, alloc1049) + R.vm.kill_object(alloc1046) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias3) + gv1762: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape962: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1049, gv1762, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1049) + gv1763: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1050: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1763, R.dtype("float16")) + cls.concatenate(reshape960, reshape961, reshape962, alloc1050) + R.vm.kill_object(reshape960) + R.vm.kill_object(reshape961) + R.vm.kill_object(reshape962) + gv1764: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape963: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1050, gv1764, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1050) + gv1765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1765, R.dtype("float16")) + _1049: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape963, alloc1051) + R.vm.kill_object(reshape963) + gv1766: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape964: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1051, gv1766, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1051) + gv1767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape965: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape964, gv1767, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape964) + model_decoder_layers_25_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] + model_decoder_layers_25_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1095] + gv1768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1768, R.dtype("float16")) + _1050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_self_attn_out_proj_weight3, reshape965, model_decoder_layers_25_self_attn_out_proj_bias3, alloc1052) + R.vm.kill_object(reshape965) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias3) + gv1769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1769, R.dtype("float16")) + cls.add(alloc1045, alloc1052, alloc1053) + R.vm.kill_object(alloc1045) + R.vm.kill_object(alloc1052) + model_decoder_layers_25_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1105] + model_decoder_layers_25_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1106] + gv1770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1770, R.dtype("float16")) + cls.layer_norm(alloc1053, model_decoder_layers_25_encoder_attn_layer_norm_weight3, model_decoder_layers_25_encoder_attn_layer_norm_bias3, alloc1054) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias3) + model_decoder_layers_25_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] + model_decoder_layers_25_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1102] + gv1771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1771, R.dtype("float16")) + _1053: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight3, alloc1054, model_decoder_layers_25_encoder_attn_q_proj_bias3, alloc1055) + R.vm.kill_object(alloc1054) + 
R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias3) + gv1772: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape966: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1055, gv1772, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1055) + gv1773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape967: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape966, gv1773, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape966) + gv1774: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1774, R.dtype("float16")) + _1054: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape967, alloc1056) + R.vm.kill_object(reshape967) + gv1775: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape968: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1056, 
gv1775, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1056) + gv1776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape969: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape968, gv1776, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape968) + model_decoder_layers_25_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] + model_decoder_layers_25_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1104] + gv1777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1057: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1777, R.dtype("float16")) + _1055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight3, reshape969, model_decoder_layers_25_encoder_attn_out_proj_bias3, alloc1057) + R.vm.kill_object(reshape969) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias3) + gv1778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1058: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1778, R.dtype("float16")) + cls.add(alloc1053, alloc1057, alloc1058) + R.vm.kill_object(alloc1053) + R.vm.kill_object(alloc1057) + 
model_decoder_layers_25_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1111] + model_decoder_layers_25_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1112] + gv1779: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1779, R.dtype("float16")) + cls.layer_norm(alloc1058, model_decoder_layers_25_final_layer_norm_weight3, model_decoder_layers_25_final_layer_norm_bias3, alloc1059) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias3) + model_decoder_layers_25_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] + model_decoder_layers_25_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1108] + gv1780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1780, R.dtype("float16")) + _1058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_25_fc1_weight3, alloc1059, model_decoder_layers_25_fc1_bias3, alloc1060) + R.vm.kill_object(alloc1059) + R.vm.kill_object(model_decoder_layers_25_fc1_weight3) + R.vm.kill_object(model_decoder_layers_25_fc1_bias3) + model_decoder_layers_25_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] + model_decoder_layers_25_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1110] + gv1781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1781, R.dtype("float16")) + _1059: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_25_fc2_weight3, alloc1060, model_decoder_layers_25_fc2_bias3, alloc1061) + R.vm.kill_object(alloc1060) + R.vm.kill_object(model_decoder_layers_25_fc2_weight3) + R.vm.kill_object(model_decoder_layers_25_fc2_bias3) + gv1782: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1782, R.dtype("float16")) + cls.add(alloc1058, alloc1061, alloc1062) + R.vm.kill_object(alloc1058) + R.vm.kill_object(alloc1061) + model_decoder_layers_26_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1120] + model_decoder_layers_26_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1121] + gv1783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1783, R.dtype("float16")) + cls.layer_norm(alloc1062, model_decoder_layers_26_self_attn_layer_norm_weight3, model_decoder_layers_26_self_attn_layer_norm_bias3, alloc1063) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias3) + model_decoder_layers_26_self_attn_q_proj_weight3: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] + model_decoder_layers_26_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1117] + gv1784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1784, R.dtype("float16")) + _1062: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_q_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_q_proj_bias3, alloc1064) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias3) + gv1785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape970: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1064, gv1785, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1064) + model_decoder_layers_26_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] + gv1786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1786, R.dtype("float16")) + _1063: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_26_self_attn_k_proj_weight3, alloc1063, alloc1065) + 
R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight3) + gv1787: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape971: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1065, gv1787, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1065) + model_decoder_layers_26_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] + model_decoder_layers_26_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1115] + gv1788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1788, R.dtype("float16")) + _1064: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_v_proj_weight3, alloc1063, model_decoder_layers_26_self_attn_v_proj_bias3, alloc1066) + R.vm.kill_object(alloc1063) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias3) + gv1789: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape972: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1066, gv1789, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1066) + gv1790: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1067: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1790, R.dtype("float16")) + cls.concatenate(reshape970, reshape971, reshape972, alloc1067) + R.vm.kill_object(reshape970) + R.vm.kill_object(reshape971) + R.vm.kill_object(reshape972) + gv1791: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape973: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1067, gv1791, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1067) + gv1792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1792, R.dtype("float16")) + _1066: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape973, alloc1068) + R.vm.kill_object(reshape973) + gv1793: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape974: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1068, gv1793, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc1068) + gv1794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape975: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape974, gv1794, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape974) + model_decoder_layers_26_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] + model_decoder_layers_26_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1119] + gv1795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1795, R.dtype("float16")) + _1067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_self_attn_out_proj_weight3, reshape975, model_decoder_layers_26_self_attn_out_proj_bias3, alloc1069) + R.vm.kill_object(reshape975) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias3) + gv1796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1796, R.dtype("float16")) + cls.add(alloc1062, alloc1069, alloc1070) + R.vm.kill_object(alloc1062) + R.vm.kill_object(alloc1069) + model_decoder_layers_26_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = 
packed_params[1129] + model_decoder_layers_26_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1130] + gv1797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1797, R.dtype("float16")) + cls.layer_norm(alloc1070, model_decoder_layers_26_encoder_attn_layer_norm_weight3, model_decoder_layers_26_encoder_attn_layer_norm_bias3, alloc1071) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias3) + model_decoder_layers_26_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] + model_decoder_layers_26_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1126] + gv1798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1798, R.dtype("float16")) + _1070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight3, alloc1071, model_decoder_layers_26_encoder_attn_q_proj_bias3, alloc1072) + R.vm.kill_object(alloc1071) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias3) + gv1799: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape976: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1072, gv1799, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1072) + gv1800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape977: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape976, gv1800, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape976) + gv1801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1801, R.dtype("float16")) + _1071: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape977, alloc1073) + R.vm.kill_object(reshape977) + gv1802: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape978: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1073, gv1802, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1073) + gv1803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape979: R.Tensor((batch_size, 
1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape978, gv1803, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape978) + model_decoder_layers_26_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] + model_decoder_layers_26_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1128] + gv1804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1074: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1804, R.dtype("float16")) + _1072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight3, reshape979, model_decoder_layers_26_encoder_attn_out_proj_bias3, alloc1074) + R.vm.kill_object(reshape979) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias3) + gv1805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1805, R.dtype("float16")) + cls.add(alloc1070, alloc1074, alloc1075) + R.vm.kill_object(alloc1070) + R.vm.kill_object(alloc1074) + model_decoder_layers_26_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1135] + model_decoder_layers_26_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1136] + gv1806: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1806, R.dtype("float16")) + cls.layer_norm(alloc1075, model_decoder_layers_26_final_layer_norm_weight3, model_decoder_layers_26_final_layer_norm_bias3, alloc1076) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias3) + model_decoder_layers_26_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] + model_decoder_layers_26_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1132] + gv1807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1807, R.dtype("float16")) + _1075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_26_fc1_weight3, alloc1076, model_decoder_layers_26_fc1_bias3, alloc1077) + R.vm.kill_object(alloc1076) + R.vm.kill_object(model_decoder_layers_26_fc1_weight3) + R.vm.kill_object(model_decoder_layers_26_fc1_bias3) + model_decoder_layers_26_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] + model_decoder_layers_26_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1134] + gv1808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1808, R.dtype("float16")) + _1076: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_26_fc2_weight3, alloc1077, model_decoder_layers_26_fc2_bias3, alloc1078) + R.vm.kill_object(alloc1077) + R.vm.kill_object(model_decoder_layers_26_fc2_weight3) + R.vm.kill_object(model_decoder_layers_26_fc2_bias3) + gv1809: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1809, R.dtype("float16")) + cls.add(alloc1075, alloc1078, alloc1079) + R.vm.kill_object(alloc1075) + R.vm.kill_object(alloc1078) + model_decoder_layers_27_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1144] + model_decoder_layers_27_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1145] + gv1810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1810, R.dtype("float16")) + cls.layer_norm(alloc1079, model_decoder_layers_27_self_attn_layer_norm_weight3, model_decoder_layers_27_self_attn_layer_norm_bias3, alloc1080) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias3) + model_decoder_layers_27_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] + model_decoder_layers_27_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1141] + gv1811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1081: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1811, R.dtype("float16")) + _1079: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_q_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_q_proj_bias3, alloc1081) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias3) + gv1812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape980: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1081, gv1812, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1081) + model_decoder_layers_27_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] + gv1813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1813, R.dtype("float16")) + _1080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_27_self_attn_k_proj_weight3, alloc1080, alloc1082) + R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight3) + gv1814: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape981: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1082, gv1814, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1082) + model_decoder_layers_27_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] + model_decoder_layers_27_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1139] + gv1815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1815, R.dtype("float16")) + _1081: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_v_proj_weight3, alloc1080, model_decoder_layers_27_self_attn_v_proj_bias3, alloc1083) + R.vm.kill_object(alloc1080) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias3) + gv1816: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape982: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1083, gv1816, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1083) + gv1817: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1084: R.Tensor(dtype="float16", ndim=4) 
= R.vm.alloc_tensor(storage14, R.prim_value(0), gv1817, R.dtype("float16")) + cls.concatenate(reshape980, reshape981, reshape982, alloc1084) + R.vm.kill_object(reshape980) + R.vm.kill_object(reshape981) + R.vm.kill_object(reshape982) + gv1818: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape983: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1084, gv1818, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1084) + gv1819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1819, R.dtype("float16")) + _1083: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape983, alloc1085) + R.vm.kill_object(reshape983) + gv1820: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape984: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1085, gv1820, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1085) + gv1821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape985: R.Tensor((batch_size, 1, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape984, gv1821, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape984) + model_decoder_layers_27_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] + model_decoder_layers_27_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1143] + gv1822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1822, R.dtype("float16")) + _1084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_self_attn_out_proj_weight3, reshape985, model_decoder_layers_27_self_attn_out_proj_bias3, alloc1086) + R.vm.kill_object(reshape985) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias3) + gv1823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1823, R.dtype("float16")) + cls.add(alloc1079, alloc1086, alloc1087) + R.vm.kill_object(alloc1079) + R.vm.kill_object(alloc1086) + model_decoder_layers_27_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1153] + model_decoder_layers_27_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1154] + gv1824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1824, R.dtype("float16")) + cls.layer_norm(alloc1087, model_decoder_layers_27_encoder_attn_layer_norm_weight3, model_decoder_layers_27_encoder_attn_layer_norm_bias3, alloc1088) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias3) + model_decoder_layers_27_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] + model_decoder_layers_27_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1150] + gv1825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1825, R.dtype("float16")) + _1087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight3, alloc1088, model_decoder_layers_27_encoder_attn_q_proj_bias3, alloc1089) + R.vm.kill_object(alloc1088) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias3) + gv1826: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape986: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1089, gv1826, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1089) + gv1827: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape987: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape986, gv1827, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape986) + gv1828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1828, R.dtype("float16")) + _1088: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape987, alloc1090) + R.vm.kill_object(reshape987) + gv1829: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape988: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1090, gv1829, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1090) + gv1830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape989: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape988, gv1830, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape988) + model_decoder_layers_27_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[1151] + model_decoder_layers_27_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1152] + gv1831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1091: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1831, R.dtype("float16")) + _1089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight3, reshape989, model_decoder_layers_27_encoder_attn_out_proj_bias3, alloc1091) + R.vm.kill_object(reshape989) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias3) + gv1832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1092: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1832, R.dtype("float16")) + cls.add(alloc1087, alloc1091, alloc1092) + R.vm.kill_object(alloc1087) + R.vm.kill_object(alloc1091) + model_decoder_layers_27_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1159] + model_decoder_layers_27_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1160] + gv1833: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1833, R.dtype("float16")) + cls.layer_norm(alloc1092, model_decoder_layers_27_final_layer_norm_weight3, 
model_decoder_layers_27_final_layer_norm_bias3, alloc1093) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias3) + model_decoder_layers_27_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] + model_decoder_layers_27_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1156] + gv1834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1834, R.dtype("float16")) + _1092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_27_fc1_weight3, alloc1093, model_decoder_layers_27_fc1_bias3, alloc1094) + R.vm.kill_object(alloc1093) + R.vm.kill_object(model_decoder_layers_27_fc1_weight3) + R.vm.kill_object(model_decoder_layers_27_fc1_bias3) + model_decoder_layers_27_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] + model_decoder_layers_27_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1158] + gv1835: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1835, R.dtype("float16")) + _1093: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_27_fc2_weight3, alloc1094, model_decoder_layers_27_fc2_bias3, alloc1095) + R.vm.kill_object(alloc1094) + R.vm.kill_object(model_decoder_layers_27_fc2_weight3) + R.vm.kill_object(model_decoder_layers_27_fc2_bias3) + gv1836: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1096: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1836, R.dtype("float16")) + cls.add(alloc1092, alloc1095, alloc1096) + R.vm.kill_object(alloc1092) + R.vm.kill_object(alloc1095) + model_decoder_layers_28_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1168] + model_decoder_layers_28_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1169] + gv1837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1837, R.dtype("float16")) + cls.layer_norm(alloc1096, model_decoder_layers_28_self_attn_layer_norm_weight3, model_decoder_layers_28_self_attn_layer_norm_bias3, alloc1097) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias3) + model_decoder_layers_28_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] + model_decoder_layers_28_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1165] + gv1838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1838, R.dtype("float16")) + _1096: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", 
model_decoder_layers_28_self_attn_q_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_q_proj_bias3, alloc1098) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias3) + gv1839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape990: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1098, gv1839, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1098) + model_decoder_layers_28_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] + gv1840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1840, R.dtype("float16")) + _1097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_28_self_attn_k_proj_weight3, alloc1097, alloc1099) + R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight3) + gv1841: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape991: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1099, gv1841, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1099) + model_decoder_layers_28_self_attn_v_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[1162] + model_decoder_layers_28_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1163] + gv1842: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1842, R.dtype("float16")) + _1098: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_v_proj_weight3, alloc1097, model_decoder_layers_28_self_attn_v_proj_bias3, alloc1100) + R.vm.kill_object(alloc1097) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias3) + gv1843: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape992: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1100, gv1843, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1100) + gv1844: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1101: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1844, R.dtype("float16")) + cls.concatenate(reshape990, reshape991, reshape992, alloc1101) + R.vm.kill_object(reshape990) + R.vm.kill_object(reshape991) + R.vm.kill_object(reshape992) + gv1845: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape993: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1101, gv1845, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1101) + gv1846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1846, R.dtype("float16")) + _1100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape993, alloc1102) + R.vm.kill_object(reshape993) + gv1847: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape994: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1102, gv1847, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1102) + gv1848: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape995: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape994, gv1848, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape994) + model_decoder_layers_28_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] + 
model_decoder_layers_28_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1167] + gv1849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1849, R.dtype("float16")) + _1101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_self_attn_out_proj_weight3, reshape995, model_decoder_layers_28_self_attn_out_proj_bias3, alloc1103) + R.vm.kill_object(reshape995) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias3) + gv1850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1850, R.dtype("float16")) + cls.add(alloc1096, alloc1103, alloc1104) + R.vm.kill_object(alloc1096) + R.vm.kill_object(alloc1103) + model_decoder_layers_28_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1177] + model_decoder_layers_28_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1178] + gv1851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1851, R.dtype("float16")) + cls.layer_norm(alloc1104, model_decoder_layers_28_encoder_attn_layer_norm_weight3, 
model_decoder_layers_28_encoder_attn_layer_norm_bias3, alloc1105) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias3) + model_decoder_layers_28_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] + model_decoder_layers_28_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1174] + gv1852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1852, R.dtype("float16")) + _1104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight3, alloc1105, model_decoder_layers_28_encoder_attn_q_proj_bias3, alloc1106) + R.vm.kill_object(alloc1105) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias3) + gv1853: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape996: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1106, gv1853, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1106) + gv1854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape997: R.Tensor((batch_size, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape996, gv1854, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape996) + gv1855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1855, R.dtype("float16")) + _1105: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape997, alloc1107) + R.vm.kill_object(reshape997) + gv1856: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape998: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1107, gv1856, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1107) + gv1857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape999: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape998, gv1857, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape998) + model_decoder_layers_28_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] + model_decoder_layers_28_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1176] + gv1858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1858, R.dtype("float16")) + _1106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight3, reshape999, model_decoder_layers_28_encoder_attn_out_proj_bias3, alloc1108) + R.vm.kill_object(reshape999) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias3) + gv1859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1859, R.dtype("float16")) + cls.add(alloc1104, alloc1108, alloc1109) + R.vm.kill_object(alloc1104) + R.vm.kill_object(alloc1108) + model_decoder_layers_28_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1183] + model_decoder_layers_28_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1184] + gv1860: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1860, R.dtype("float16")) + cls.layer_norm(alloc1109, model_decoder_layers_28_final_layer_norm_weight3, model_decoder_layers_28_final_layer_norm_bias3, alloc1110) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias3) + model_decoder_layers_28_fc1_weight3: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[1179] + model_decoder_layers_28_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1180] + gv1861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1861, R.dtype("float16")) + _1109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_28_fc1_weight3, alloc1110, model_decoder_layers_28_fc1_bias3, alloc1111) + R.vm.kill_object(alloc1110) + R.vm.kill_object(model_decoder_layers_28_fc1_weight3) + R.vm.kill_object(model_decoder_layers_28_fc1_bias3) + model_decoder_layers_28_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] + model_decoder_layers_28_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1182] + gv1862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1862, R.dtype("float16")) + _1110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_28_fc2_weight3, alloc1111, model_decoder_layers_28_fc2_bias3, alloc1112) + R.vm.kill_object(alloc1111) + R.vm.kill_object(model_decoder_layers_28_fc2_weight3) + R.vm.kill_object(model_decoder_layers_28_fc2_bias3) + gv1863: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1113: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage15, R.prim_value(0), gv1863, R.dtype("float16")) + cls.add(alloc1109, alloc1112, alloc1113) + R.vm.kill_object(alloc1109) + R.vm.kill_object(alloc1112) + model_decoder_layers_29_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1192] + model_decoder_layers_29_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1193] + gv1864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1864, R.dtype("float16")) + cls.layer_norm(alloc1113, model_decoder_layers_29_self_attn_layer_norm_weight3, model_decoder_layers_29_self_attn_layer_norm_bias3, alloc1114) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias3) + model_decoder_layers_29_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] + model_decoder_layers_29_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1189] + gv1865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1865, R.dtype("float16")) + _1113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_q_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_q_proj_bias3, alloc1115) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias3) + gv1866: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1000: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1115, gv1866, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1115) + model_decoder_layers_29_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] + gv1867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1867, R.dtype("float16")) + _1114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_29_self_attn_k_proj_weight3, alloc1114, alloc1116) + R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight3) + gv1868: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1001: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1116, gv1868, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1116) + model_decoder_layers_29_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] + model_decoder_layers_29_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1187] + gv1869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1869, R.dtype("float16")) + _1115: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_v_proj_weight3, alloc1114, model_decoder_layers_29_self_attn_v_proj_bias3, alloc1117) + R.vm.kill_object(alloc1114) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias3) + gv1870: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1002: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1117, gv1870, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1117) + gv1871: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1118: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1871, R.dtype("float16")) + cls.concatenate(reshape1000, reshape1001, reshape1002, alloc1118) + R.vm.kill_object(reshape1000) + R.vm.kill_object(reshape1001) + R.vm.kill_object(reshape1002) + gv1872: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1003: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1118, gv1872, 
sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1118) + gv1873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1873, R.dtype("float16")) + _1117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1003, alloc1119) + R.vm.kill_object(reshape1003) + gv1874: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1004: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1119, gv1874, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1119) + gv1875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1005: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1004, gv1875, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1004) + model_decoder_layers_29_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] + model_decoder_layers_29_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1191] + gv1876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1876, R.dtype("float16")) + _1118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_self_attn_out_proj_weight3, reshape1005, model_decoder_layers_29_self_attn_out_proj_bias3, alloc1120) + R.vm.kill_object(reshape1005) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias3) + gv1877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1877, R.dtype("float16")) + cls.add(alloc1113, alloc1120, alloc1121) + R.vm.kill_object(alloc1113) + R.vm.kill_object(alloc1120) + model_decoder_layers_29_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1201] + model_decoder_layers_29_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1202] + gv1878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1878, R.dtype("float16")) + cls.layer_norm(alloc1121, model_decoder_layers_29_encoder_attn_layer_norm_weight3, model_decoder_layers_29_encoder_attn_layer_norm_bias3, alloc1122) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias3) + model_decoder_layers_29_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[1197] + model_decoder_layers_29_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1198] + gv1879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1879, R.dtype("float16")) + _1121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight3, alloc1122, model_decoder_layers_29_encoder_attn_q_proj_bias3, alloc1123) + R.vm.kill_object(alloc1122) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias3) + gv1880: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1006: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1123, gv1880, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1123) + gv1881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1007: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1006, gv1881, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1006) + gv1882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1882, R.dtype("float16")) + _1122: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1007, alloc1124) + R.vm.kill_object(reshape1007) + gv1883: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1008: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1124, gv1883, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1124) + gv1884: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1009: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1008, gv1884, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1008) + model_decoder_layers_29_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] + model_decoder_layers_29_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1200] + gv1885: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1885, R.dtype("float16")) + _1123: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight3, reshape1009, model_decoder_layers_29_encoder_attn_out_proj_bias3, alloc1125) + R.vm.kill_object(reshape1009) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias3) + gv1886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1886, R.dtype("float16")) + cls.add(alloc1121, alloc1125, alloc1126) + R.vm.kill_object(alloc1121) + R.vm.kill_object(alloc1125) + model_decoder_layers_29_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1207] + model_decoder_layers_29_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1208] + gv1887: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1887, R.dtype("float16")) + cls.layer_norm(alloc1126, model_decoder_layers_29_final_layer_norm_weight3, model_decoder_layers_29_final_layer_norm_bias3, alloc1127) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias3) + model_decoder_layers_29_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] + model_decoder_layers_29_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1204] + gv1888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1888, R.dtype("float16")) + _1126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_29_fc1_weight3, alloc1127, model_decoder_layers_29_fc1_bias3, alloc1128) + R.vm.kill_object(alloc1127) + R.vm.kill_object(model_decoder_layers_29_fc1_weight3) + R.vm.kill_object(model_decoder_layers_29_fc1_bias3) + model_decoder_layers_29_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] + model_decoder_layers_29_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1206] + gv1889: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1889, R.dtype("float16")) + _1127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_29_fc2_weight3, alloc1128, model_decoder_layers_29_fc2_bias3, alloc1129) + R.vm.kill_object(alloc1128) + R.vm.kill_object(model_decoder_layers_29_fc2_weight3) + R.vm.kill_object(model_decoder_layers_29_fc2_bias3) + gv1890: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1890, R.dtype("float16")) + cls.add(alloc1126, alloc1129, alloc1130) + R.vm.kill_object(alloc1126) + R.vm.kill_object(alloc1129) + model_decoder_layers_30_self_attn_layer_norm_weight3: R.Tensor((1280,), 
dtype="float16") = packed_params[1216] + model_decoder_layers_30_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1217] + gv1891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1891, R.dtype("float16")) + cls.layer_norm(alloc1130, model_decoder_layers_30_self_attn_layer_norm_weight3, model_decoder_layers_30_self_attn_layer_norm_bias3, alloc1131) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias3) + model_decoder_layers_30_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] + model_decoder_layers_30_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1213] + gv1892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1892, R.dtype("float16")) + _1130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_q_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_q_proj_bias3, alloc1132) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias3) + gv1893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1010: 
R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1132, gv1893, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1132) + model_decoder_layers_30_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] + gv1894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1894, R.dtype("float16")) + _1131: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_30_self_attn_k_proj_weight3, alloc1131, alloc1133) + R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight3) + gv1895: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1011: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1133, gv1895, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1133) + model_decoder_layers_30_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] + model_decoder_layers_30_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1211] + gv1896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1896, R.dtype("float16")) + _1132: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_v_proj_weight3, alloc1131, model_decoder_layers_30_self_attn_v_proj_bias3, alloc1134) + R.vm.kill_object(alloc1131) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias3) + gv1897: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1012: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1134, gv1897, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1134) + gv1898: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1135: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1898, R.dtype("float16")) + cls.concatenate(reshape1010, reshape1011, reshape1012, alloc1135) + R.vm.kill_object(reshape1010) + R.vm.kill_object(reshape1011) + R.vm.kill_object(reshape1012) + gv1899: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1013: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1135, gv1899, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1135) + gv1900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1900, R.dtype("float16")) + _1134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1013, alloc1136) + R.vm.kill_object(reshape1013) + gv1901: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1014: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1136, gv1901, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1136) + gv1902: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1015: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1014, gv1902, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1014) + model_decoder_layers_30_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] + model_decoder_layers_30_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1215] + gv1903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1903, R.dtype("float16")) + _1135: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_self_attn_out_proj_weight3, reshape1015, model_decoder_layers_30_self_attn_out_proj_bias3, alloc1137) + R.vm.kill_object(reshape1015) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias3) + gv1904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1904, R.dtype("float16")) + cls.add(alloc1130, alloc1137, alloc1138) + R.vm.kill_object(alloc1130) + R.vm.kill_object(alloc1137) + model_decoder_layers_30_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1225] + model_decoder_layers_30_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1226] + gv1905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1905, R.dtype("float16")) + cls.layer_norm(alloc1138, model_decoder_layers_30_encoder_attn_layer_norm_weight3, model_decoder_layers_30_encoder_attn_layer_norm_bias3, alloc1139) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias3) + model_decoder_layers_30_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] + model_decoder_layers_30_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1222] + gv1906: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1906, R.dtype("float16")) + _1138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight3, alloc1139, model_decoder_layers_30_encoder_attn_q_proj_bias3, alloc1140) + R.vm.kill_object(alloc1139) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias3) + gv1907: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1016: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1140, gv1907, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1140) + gv1908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1017: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1016, gv1908, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1016) + gv1909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1909, 
R.dtype("float16")) + _1139: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1017, alloc1141) + R.vm.kill_object(reshape1017) + gv1910: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1018: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1141, gv1910, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1141) + gv1911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1019: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1018, gv1911, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1018) + model_decoder_layers_30_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] + model_decoder_layers_30_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1224] + gv1912: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1912, R.dtype("float16")) + _1140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight3, reshape1019, model_decoder_layers_30_encoder_attn_out_proj_bias3, alloc1142) + R.vm.kill_object(reshape1019) 
+ R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias3) + gv1913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1913, R.dtype("float16")) + cls.add(alloc1138, alloc1142, alloc1143) + R.vm.kill_object(alloc1138) + R.vm.kill_object(alloc1142) + model_decoder_layers_30_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1231] + model_decoder_layers_30_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1232] + gv1914: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1914, R.dtype("float16")) + cls.layer_norm(alloc1143, model_decoder_layers_30_final_layer_norm_weight3, model_decoder_layers_30_final_layer_norm_bias3, alloc1144) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias3) + model_decoder_layers_30_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] + model_decoder_layers_30_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1228] + gv1915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1915, R.dtype("float16")) + 
_1143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_30_fc1_weight3, alloc1144, model_decoder_layers_30_fc1_bias3, alloc1145) + R.vm.kill_object(alloc1144) + R.vm.kill_object(model_decoder_layers_30_fc1_weight3) + R.vm.kill_object(model_decoder_layers_30_fc1_bias3) + model_decoder_layers_30_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] + model_decoder_layers_30_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1230] + gv1916: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1916, R.dtype("float16")) + _1144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_30_fc2_weight3, alloc1145, model_decoder_layers_30_fc2_bias3, alloc1146) + R.vm.kill_object(alloc1145) + R.vm.kill_object(model_decoder_layers_30_fc2_weight3) + R.vm.kill_object(model_decoder_layers_30_fc2_bias3) + gv1917: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1917, R.dtype("float16")) + cls.add(alloc1143, alloc1146, alloc1147) + R.vm.kill_object(alloc1143) + R.vm.kill_object(alloc1146) + model_decoder_layers_31_self_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1240] + model_decoder_layers_31_self_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1241] + gv1918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1918, R.dtype("float16")) + cls.layer_norm(alloc1147, model_decoder_layers_31_self_attn_layer_norm_weight3, model_decoder_layers_31_self_attn_layer_norm_bias3, alloc1148) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias3) + model_decoder_layers_31_self_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] + model_decoder_layers_31_self_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1237] + gv1919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1919, R.dtype("float16")) + _1147: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_q_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_q_proj_bias3, alloc1149) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias3) + gv1920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1020: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1149, gv1920, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1149) + 
model_decoder_layers_31_self_attn_k_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] + gv1921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1921, R.dtype("float16")) + _1148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul3_cublas", model_decoder_layers_31_self_attn_k_proj_weight3, alloc1148, alloc1150) + R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight3) + gv1922: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1021: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1150, gv1922, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1150) + model_decoder_layers_31_self_attn_v_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] + model_decoder_layers_31_self_attn_v_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1235] + gv1923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1923, R.dtype("float16")) + _1149: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_v_proj_weight3, alloc1148, model_decoder_layers_31_self_attn_v_proj_bias3, alloc1151) + R.vm.kill_object(alloc1148) + 
R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight3) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias3) + gv1924: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1022: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1151, gv1924, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1151) + gv1925: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1152: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1925, R.dtype("float16")) + cls.concatenate(reshape1020, reshape1021, reshape1022, alloc1152) + R.vm.kill_object(reshape1020) + R.vm.kill_object(reshape1021) + R.vm.kill_object(reshape1022) + gv1926: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1023: R.Tensor((batch_size, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1152, gv1926, sinfo_args=(R.Tensor((batch_size, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1152) + gv1927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1927, R.dtype("float16")) + _1151: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1023, alloc1153) + R.vm.kill_object(reshape1023) + gv1928: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1024: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1153, gv1928, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1153) + gv1929: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1025: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1024, gv1929, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1024) + model_decoder_layers_31_self_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] + model_decoder_layers_31_self_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1239] + gv1930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1930, R.dtype("float16")) + _1152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_self_attn_out_proj_weight3, reshape1025, model_decoder_layers_31_self_attn_out_proj_bias3, alloc1154) + R.vm.kill_object(reshape1025) + 
R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias3) + gv1931: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1931, R.dtype("float16")) + cls.add(alloc1147, alloc1154, alloc1155) + R.vm.kill_object(alloc1147) + R.vm.kill_object(alloc1154) + model_decoder_layers_31_encoder_attn_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1249] + model_decoder_layers_31_encoder_attn_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1250] + gv1932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1932, R.dtype("float16")) + cls.layer_norm(alloc1155, model_decoder_layers_31_encoder_attn_layer_norm_weight3, model_decoder_layers_31_encoder_attn_layer_norm_bias3, alloc1156) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias3) + model_decoder_layers_31_encoder_attn_q_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] + model_decoder_layers_31_encoder_attn_q_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1246] + gv1933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1157: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage15, R.prim_value(0), gv1933, R.dtype("float16")) + _1155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight3, alloc1156, model_decoder_layers_31_encoder_attn_q_proj_bias3, alloc1157) + R.vm.kill_object(alloc1156) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight3) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias3) + gv1934: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1026: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1157, gv1934, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1157) + gv1935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1027: R.Tensor((batch_size, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1026, gv1935, sinfo_args=(R.Tensor((batch_size, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1026) + gv1936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1936, R.dtype("float16")) + _1156: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1027, alloc1158) + R.vm.kill_object(reshape1027) + gv1937: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1028: R.Tensor((batch_size, 1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1158, gv1937, sinfo_args=(R.Tensor((batch_size, 1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1158) + gv1938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1029: R.Tensor((batch_size, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1028, gv1938, sinfo_args=(R.Tensor((batch_size, 1, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1028) + model_decoder_layers_31_encoder_attn_out_proj_weight3: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] + model_decoder_layers_31_encoder_attn_out_proj_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1248] + gv1939: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1939, R.dtype("float16")) + _1157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add3_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight3, reshape1029, model_decoder_layers_31_encoder_attn_out_proj_bias3, alloc1159) + R.vm.kill_object(reshape1029) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight3) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias3) + gv1940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage15, R.prim_value(0), gv1940, R.dtype("float16")) + R.vm.kill_object(storage15) + cls.add(alloc1155, alloc1159, alloc1160) + R.vm.kill_object(alloc1155) + R.vm.kill_object(alloc1159) + model_decoder_layers_31_final_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1255] + model_decoder_layers_31_final_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1256] + gv1941: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1941, R.dtype("float16")) + cls.layer_norm(alloc1160, model_decoder_layers_31_final_layer_norm_weight3, model_decoder_layers_31_final_layer_norm_bias3, alloc1161) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight3) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias3) + model_decoder_layers_31_fc1_weight3: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] + model_decoder_layers_31_fc1_bias3: R.Tensor((5120,), dtype="float16") = packed_params[1252] + gv1942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage13, R.prim_value(0), gv1942, R.dtype("float16")) + R.vm.kill_object(storage13) + _1160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu1_cublas", model_decoder_layers_31_fc1_weight3, alloc1161, 
model_decoder_layers_31_fc1_bias3, alloc1162) + R.vm.kill_object(alloc1161) + R.vm.kill_object(model_decoder_layers_31_fc1_weight3) + R.vm.kill_object(model_decoder_layers_31_fc1_bias3) + model_decoder_layers_31_fc2_weight3: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] + model_decoder_layers_31_fc2_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1254] + gv1943: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage14, R.prim_value(0), gv1943, R.dtype("float16")) + R.vm.kill_object(storage14) + _1161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add4_cublas", model_decoder_layers_31_fc2_weight3, alloc1162, model_decoder_layers_31_fc2_bias3, alloc1163) + R.vm.kill_object(alloc1162) + R.vm.kill_object(model_decoder_layers_31_fc2_weight3) + R.vm.kill_object(model_decoder_layers_31_fc2_bias3) + gv1944: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage16, R.prim_value(0), gv1944, R.dtype("float16")) + R.vm.kill_object(storage16) + cls.add(alloc1160, alloc1163, alloc1164) + R.vm.kill_object(alloc1160) + R.vm.kill_object(alloc1163) + model_decoder_layer_norm_weight3: R.Tensor((1280,), dtype="float16") = packed_params[1257] + model_decoder_layer_norm_bias3: R.Tensor((1280,), dtype="float16") = packed_params[1258] + gv1945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc1165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage17, R.prim_value(0), gv1945, R.dtype("float16")) + R.vm.kill_object(storage17) + cls.layer_norm(alloc1164, model_decoder_layer_norm_weight3, model_decoder_layer_norm_bias3, alloc1165) + R.vm.kill_object(alloc1164) + R.vm.kill_object(model_decoder_layer_norm_weight3) + R.vm.kill_object(model_decoder_layer_norm_bias3) + storage18: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) + alloc1166: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage18, R.prim_value(0), gv1946, R.dtype("float32")) + R.vm.kill_object(storage18) + _1164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul4_cublas", model_decoder_embed_tokens_weight3, alloc1165, alloc1166) + R.vm.kill_object(model_decoder_embed_tokens_weight3) + R.vm.kill_object(alloc1165) + R.call_packed("vm.builtin.match_shape", alloc1166, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_decode, loc=return, annotation=R.Tensor((batch_size, 1, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + return alloc1166 + + @R.function + def batch_encode(input_features: R.Tensor(("batch_size", 128, 3000), dtype="float16"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor(("batch_size", 1500, 1280), dtype="float16"): + batch_size = T.int64() + R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", 
(R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", input_features, R.prim_value(3), R.dtype("float16"), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_encode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), 
R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", input_features, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(128), R.prim_value(0), R.prim_value(3000), R.str("ErrorContext(fn=batch_encode, loc=param[0], param=input_features, annotation=R.Tensor((batch_size, 128, 3000), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) + cls.shape_func1(shape_heap) + lv: R.Tensor((1280,), dtype="float16") = packed_params[1] + lv1: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) + R.vm.kill_object(lv) + lv2: R.Tensor((1280,), dtype="float16") = packed_params[3] + lv3: R.Tensor((1, 1280, 1), dtype="float16") = R.call_packed("vm.builtin.reshape", lv2, R.shape([1, 1280, 1]), sinfo_args=(R.Tensor((1, 1280, 1), dtype="float16"),)) + 
R.vm.kill_object(lv2) + model_encoder_conv1_weight: R.Tensor((1280, 128, 3), dtype="float16") = packed_params[0] + storage24: R.Object = R.vm.alloc_storage(R.shape([122880000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1947: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(3000), sinfo_args=(R.Shape(ndim=3),)) + alloc1620: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1947, R.dtype("float16")) + cls.fused_conv1d_add1_gelu(input_features, model_encoder_conv1_weight, lv1, alloc1620) + R.vm.kill_object(lv1) + R.vm.kill_object(model_encoder_conv1_weight) + model_encoder_conv2_weight: R.Tensor((1280, 1280, 3), dtype="float16") = packed_params[2] + storage25: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), R.prim_value(0), R.prim_value(1500), sinfo_args=(R.Shape(ndim=3),)) + alloc1621: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1948, R.dtype("float16")) + cls.fused_conv1d1_add2_gelu1(alloc1620, model_encoder_conv2_weight, lv3, alloc1621) + R.vm.kill_object(lv3) + R.vm.kill_object(alloc1620) + R.vm.kill_object(model_encoder_conv2_weight) + lv6: R.Tensor((1500, 1280), dtype="float16") = packed_params[4] + gv1949: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1622: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1949, R.dtype("float16")) + cls.fused_transpose_add3(lv6, alloc1621, alloc1622) + 
R.vm.kill_object(alloc1621) + R.vm.kill_object(lv6) + model_encoder_layers_0_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[12] + model_encoder_layers_0_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[13] + gv1950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1623: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1950, R.dtype("float16")) + cls.layer_norm1(alloc1622, model_encoder_layers_0_self_attn_layer_norm_weight, model_encoder_layers_0_self_attn_layer_norm_bias, alloc1623) + R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_0_self_attn_layer_norm_bias) + model_encoder_layers_0_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[8] + model_encoder_layers_0_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[9] + storage26: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1624: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1951, R.dtype("float16")) + _1622: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_q_proj_weight, alloc1623, model_encoder_layers_0_self_attn_q_proj_bias, alloc1624) + R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_0_self_attn_q_proj_bias) + gv1952: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1624, gv1952, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1624) + model_encoder_layers_0_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[5] + storage27: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1953: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1625: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1953, R.dtype("float16")) + _1623: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_0_self_attn_k_proj_weight, alloc1623, alloc1625) + R.vm.kill_object(model_encoder_layers_0_self_attn_k_proj_weight) + gv1954: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1625, gv1954, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1625) + model_encoder_layers_0_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[6] + model_encoder_layers_0_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[7] + storage28: R.Object = R.vm.alloc_storage(R.shape([30720000]), 
R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1955: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1626: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1955, R.dtype("float16")) + _1624: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_v_proj_weight, alloc1623, model_encoder_layers_0_self_attn_v_proj_bias, alloc1626) + R.vm.kill_object(alloc1623) + R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_0_self_attn_v_proj_bias) + gv1956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape2: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1626, gv1956, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1626) + gv1957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape3: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape, gv1957, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape) + gv1958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape4: R.Tensor((batch_size * 1500, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1, gv1958, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1) + gv1959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape5: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape2, gv1959, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape2) + gv1960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1627: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1960, R.dtype("float16")) + _1625: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape3, reshape4, reshape5, alloc1627) + R.vm.kill_object(reshape3) + R.vm.kill_object(reshape4) + R.vm.kill_object(reshape5) + gv1961: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape6: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1627, gv1961, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1627) + gv1962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape7: 
R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape6, gv1962, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape6) + model_encoder_layers_0_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[10] + model_encoder_layers_0_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[11] + gv1963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1628: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1963, R.dtype("float16")) + _1626: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_0_self_attn_out_proj_weight, reshape7, model_encoder_layers_0_self_attn_out_proj_bias, alloc1628) + R.vm.kill_object(reshape7) + R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_0_self_attn_out_proj_bias) + gv1964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1629: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1964, R.dtype("float16")) + cls.add4(alloc1622, alloc1628, alloc1629) + R.vm.kill_object(alloc1622) + R.vm.kill_object(alloc1628) + model_encoder_layers_0_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[18] + model_encoder_layers_0_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[19] + gv1965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1630: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1965, R.dtype("float16")) + cls.layer_norm1(alloc1629, model_encoder_layers_0_final_layer_norm_weight, model_encoder_layers_0_final_layer_norm_bias, alloc1630) + R.vm.kill_object(model_encoder_layers_0_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_0_final_layer_norm_bias) + model_encoder_layers_0_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[14] + model_encoder_layers_0_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[15] + gv1966: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1631: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1966, R.dtype("float16")) + _1629: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_0_fc1_weight, alloc1630, model_encoder_layers_0_fc1_bias, alloc1631) + R.vm.kill_object(alloc1630) + R.vm.kill_object(model_encoder_layers_0_fc1_weight) + R.vm.kill_object(model_encoder_layers_0_fc1_bias) + model_encoder_layers_0_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[16] + model_encoder_layers_0_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[17] + gv1967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1632: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1967, R.dtype("float16")) + _1630: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", 
model_encoder_layers_0_fc2_weight, alloc1631, model_encoder_layers_0_fc2_bias, alloc1632) + R.vm.kill_object(alloc1631) + R.vm.kill_object(model_encoder_layers_0_fc2_weight) + R.vm.kill_object(model_encoder_layers_0_fc2_bias) + gv1968: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1633: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1968, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1629, alloc1632, alloc1633) + R.vm.kill_object(alloc1629) + R.vm.kill_object(alloc1632) + model_encoder_layers_1_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[27] + model_encoder_layers_1_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[28] + gv1969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1634: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1969, R.dtype("float16")) + cls.layer_norm1(alloc1633, model_encoder_layers_1_self_attn_layer_norm_weight, model_encoder_layers_1_self_attn_layer_norm_bias, alloc1634) + R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_1_self_attn_layer_norm_bias) + model_encoder_layers_1_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[23] + model_encoder_layers_1_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[24] + gv1970: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc1635: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1970, R.dtype("float16")) + _1633: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_q_proj_weight, alloc1634, model_encoder_layers_1_self_attn_q_proj_bias, alloc1635) + R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_1_self_attn_q_proj_bias) + gv1971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape8: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1635, gv1971, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1635) + model_encoder_layers_1_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[20] + gv1972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1636: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1972, R.dtype("float16")) + _1634: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_1_self_attn_k_proj_weight, alloc1634, alloc1636) + R.vm.kill_object(model_encoder_layers_1_self_attn_k_proj_weight) + gv1973: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape9: R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1636, gv1973, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1636) + model_encoder_layers_1_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[21] + model_encoder_layers_1_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[22] + gv1974: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1637: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1974, R.dtype("float16")) + _1635: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_v_proj_weight, alloc1634, model_encoder_layers_1_self_attn_v_proj_bias, alloc1637) + R.vm.kill_object(alloc1634) + R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_1_self_attn_v_proj_bias) + gv1975: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape10: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1637, gv1975, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1637) + gv1976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape11: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape8, gv1976, sinfo_args=(R.Tensor((batch_size * 
1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape8) + gv1977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape12: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape9, gv1977, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape9) + gv1978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape13: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape10, gv1978, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape10) + gv1979: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1638: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1979, R.dtype("float16")) + _1636: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape11, reshape12, reshape13, alloc1638) + R.vm.kill_object(reshape11) + R.vm.kill_object(reshape12) + R.vm.kill_object(reshape13) + gv1980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape14: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1638, 
gv1980, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1638) + gv1981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape15: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape14, gv1981, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape14) + model_encoder_layers_1_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[25] + model_encoder_layers_1_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[26] + gv1982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1639: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1982, R.dtype("float16")) + _1637: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_1_self_attn_out_proj_weight, reshape15, model_encoder_layers_1_self_attn_out_proj_bias, alloc1639) + R.vm.kill_object(reshape15) + R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_1_self_attn_out_proj_bias) + gv1983: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1640: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1983, R.dtype("float16")) + cls.add4(alloc1633, alloc1639, alloc1640) + R.vm.kill_object(alloc1633) + R.vm.kill_object(alloc1639) + 
model_encoder_layers_1_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[33] + model_encoder_layers_1_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[34] + gv1984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1641: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1984, R.dtype("float16")) + cls.layer_norm1(alloc1640, model_encoder_layers_1_final_layer_norm_weight, model_encoder_layers_1_final_layer_norm_bias, alloc1641) + R.vm.kill_object(model_encoder_layers_1_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_1_final_layer_norm_bias) + model_encoder_layers_1_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[29] + model_encoder_layers_1_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[30] + gv1985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1642: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1985, R.dtype("float16")) + _1640: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_1_fc1_weight, alloc1641, model_encoder_layers_1_fc1_bias, alloc1642) + R.vm.kill_object(alloc1641) + R.vm.kill_object(model_encoder_layers_1_fc1_weight) + R.vm.kill_object(model_encoder_layers_1_fc1_bias) + model_encoder_layers_1_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[31] + model_encoder_layers_1_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[32] + gv1986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1643: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1986, R.dtype("float16")) + _1641: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_1_fc2_weight, alloc1642, model_encoder_layers_1_fc2_bias, alloc1643) + R.vm.kill_object(alloc1642) + R.vm.kill_object(model_encoder_layers_1_fc2_weight) + R.vm.kill_object(model_encoder_layers_1_fc2_bias) + gv1987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1644: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv1987, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1640, alloc1643, alloc1644) + R.vm.kill_object(alloc1640) + R.vm.kill_object(alloc1643) + model_encoder_layers_2_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[42] + model_encoder_layers_2_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[43] + gv1988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1645: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1988, R.dtype("float16")) + cls.layer_norm1(alloc1644, model_encoder_layers_2_self_attn_layer_norm_weight, model_encoder_layers_2_self_attn_layer_norm_bias, alloc1645) + R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_2_self_attn_layer_norm_bias) + model_encoder_layers_2_self_attn_q_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[38] + model_encoder_layers_2_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[39] + gv1989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1646: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv1989, R.dtype("float16")) + _1644: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_q_proj_weight, alloc1645, model_encoder_layers_2_self_attn_q_proj_bias, alloc1646) + R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_2_self_attn_q_proj_bias) + gv1990: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape16: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1646, gv1990, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1646) + model_encoder_layers_2_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[35] + gv1991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1647: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv1991, R.dtype("float16")) + _1645: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_2_self_attn_k_proj_weight, alloc1645, alloc1647) + 
R.vm.kill_object(model_encoder_layers_2_self_attn_k_proj_weight) + gv1992: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape17: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1647, gv1992, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1647) + model_encoder_layers_2_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[36] + model_encoder_layers_2_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[37] + gv1993: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1648: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv1993, R.dtype("float16")) + _1646: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_v_proj_weight, alloc1645, model_encoder_layers_2_self_attn_v_proj_bias, alloc1648) + R.vm.kill_object(alloc1645) + R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_2_self_attn_v_proj_bias) + gv1994: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape18: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1648, gv1994, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1648) + gv1995: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape19: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape16, gv1995, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape16) + gv1996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape20: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape17, gv1996, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape17) + gv1997: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape21: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape18, gv1997, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape18) + gv1998: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1649: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv1998, R.dtype("float16")) + _1647: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape19, reshape20, reshape21, alloc1649) + R.vm.kill_object(reshape19) + R.vm.kill_object(reshape20) + R.vm.kill_object(reshape21) + 
gv1999: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape22: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1649, gv1999, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1649) + gv2000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape23: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape22, gv2000, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape22) + model_encoder_layers_2_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[40] + model_encoder_layers_2_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[41] + gv2001: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1650: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2001, R.dtype("float16")) + _1648: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_2_self_attn_out_proj_weight, reshape23, model_encoder_layers_2_self_attn_out_proj_bias, alloc1650) + R.vm.kill_object(reshape23) + R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_2_self_attn_out_proj_bias) + gv2002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1651: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2002, R.dtype("float16")) + cls.add4(alloc1644, alloc1650, alloc1651) + R.vm.kill_object(alloc1644) + R.vm.kill_object(alloc1650) + model_encoder_layers_2_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[48] + model_encoder_layers_2_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[49] + gv2003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1652: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2003, R.dtype("float16")) + cls.layer_norm1(alloc1651, model_encoder_layers_2_final_layer_norm_weight, model_encoder_layers_2_final_layer_norm_bias, alloc1652) + R.vm.kill_object(model_encoder_layers_2_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_2_final_layer_norm_bias) + model_encoder_layers_2_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[44] + model_encoder_layers_2_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[45] + gv2004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1653: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2004, R.dtype("float16")) + _1651: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_2_fc1_weight, alloc1652, model_encoder_layers_2_fc1_bias, alloc1653) + R.vm.kill_object(alloc1652) + 
R.vm.kill_object(model_encoder_layers_2_fc1_weight) + R.vm.kill_object(model_encoder_layers_2_fc1_bias) + model_encoder_layers_2_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[46] + model_encoder_layers_2_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[47] + gv2005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1654: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2005, R.dtype("float16")) + _1652: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_2_fc2_weight, alloc1653, model_encoder_layers_2_fc2_bias, alloc1654) + R.vm.kill_object(alloc1653) + R.vm.kill_object(model_encoder_layers_2_fc2_weight) + R.vm.kill_object(model_encoder_layers_2_fc2_bias) + gv2006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1655: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2006, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1651, alloc1654, alloc1655) + R.vm.kill_object(alloc1651) + R.vm.kill_object(alloc1654) + model_encoder_layers_3_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[57] + model_encoder_layers_3_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[58] + gv2007: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1656: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), 
gv2007, R.dtype("float16")) + cls.layer_norm1(alloc1655, model_encoder_layers_3_self_attn_layer_norm_weight, model_encoder_layers_3_self_attn_layer_norm_bias, alloc1656) + R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_3_self_attn_layer_norm_bias) + model_encoder_layers_3_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[53] + model_encoder_layers_3_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[54] + gv2008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1657: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2008, R.dtype("float16")) + _1655: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_q_proj_weight, alloc1656, model_encoder_layers_3_self_attn_q_proj_bias, alloc1657) + R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_3_self_attn_q_proj_bias) + gv2009: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape24: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1657, gv2009, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1657) + model_encoder_layers_3_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[50] + gv2010: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1658: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2010, R.dtype("float16")) + _1656: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_3_self_attn_k_proj_weight, alloc1656, alloc1658) + R.vm.kill_object(model_encoder_layers_3_self_attn_k_proj_weight) + gv2011: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape25: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1658, gv2011, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1658) + model_encoder_layers_3_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[51] + model_encoder_layers_3_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[52] + gv2012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1659: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2012, R.dtype("float16")) + _1657: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_3_self_attn_v_proj_weight, alloc1656, model_encoder_layers_3_self_attn_v_proj_bias, alloc1659) + R.vm.kill_object(alloc1656) + R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_3_self_attn_v_proj_bias) + gv2013: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape26: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1659, gv2013, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1659) + gv2014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape27: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape24, gv2014, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape24) + gv2015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape28: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape25, gv2015, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape25) + gv2016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape29: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape26, gv2016, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape26) + gv2017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1660: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2017, R.dtype("float16")) + _1658: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape27, reshape28, reshape29, alloc1660) + R.vm.kill_object(reshape27) + R.vm.kill_object(reshape28) + R.vm.kill_object(reshape29) + gv2018: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape30: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1660, gv2018, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1660) + gv2019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape31: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape30, gv2019, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape30) + model_encoder_layers_3_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[55] + model_encoder_layers_3_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[56] + gv2020: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1661: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2020, R.dtype("float16")) + _1659: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_3_self_attn_out_proj_weight, reshape31, model_encoder_layers_3_self_attn_out_proj_bias, alloc1661) + R.vm.kill_object(reshape31) + R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_3_self_attn_out_proj_bias) + gv2021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1662: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2021, R.dtype("float16")) + cls.add4(alloc1655, alloc1661, alloc1662) + R.vm.kill_object(alloc1655) + R.vm.kill_object(alloc1661) + model_encoder_layers_3_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[63] + model_encoder_layers_3_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[64] + gv2022: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1663: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2022, R.dtype("float16")) + cls.layer_norm1(alloc1662, model_encoder_layers_3_final_layer_norm_weight, model_encoder_layers_3_final_layer_norm_bias, alloc1663) + R.vm.kill_object(model_encoder_layers_3_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_3_final_layer_norm_bias) + model_encoder_layers_3_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[59] + model_encoder_layers_3_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[60] + gv2023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), 
sinfo_args=(R.Shape(ndim=3),)) + alloc1664: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2023, R.dtype("float16")) + _1662: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_3_fc1_weight, alloc1663, model_encoder_layers_3_fc1_bias, alloc1664) + R.vm.kill_object(alloc1663) + R.vm.kill_object(model_encoder_layers_3_fc1_weight) + R.vm.kill_object(model_encoder_layers_3_fc1_bias) + model_encoder_layers_3_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[61] + model_encoder_layers_3_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[62] + gv2024: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1665: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2024, R.dtype("float16")) + _1663: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_3_fc2_weight, alloc1664, model_encoder_layers_3_fc2_bias, alloc1665) + R.vm.kill_object(alloc1664) + R.vm.kill_object(model_encoder_layers_3_fc2_weight) + R.vm.kill_object(model_encoder_layers_3_fc2_bias) + gv2025: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1666: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2025, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1662, alloc1665, alloc1666) + R.vm.kill_object(alloc1662) + R.vm.kill_object(alloc1665) + model_encoder_layers_4_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[72] + model_encoder_layers_4_self_attn_layer_norm_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[73] + gv2026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1667: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2026, R.dtype("float16")) + cls.layer_norm1(alloc1666, model_encoder_layers_4_self_attn_layer_norm_weight, model_encoder_layers_4_self_attn_layer_norm_bias, alloc1667) + R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_4_self_attn_layer_norm_bias) + model_encoder_layers_4_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[68] + model_encoder_layers_4_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[69] + gv2027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1668: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2027, R.dtype("float16")) + _1666: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_q_proj_weight, alloc1667, model_encoder_layers_4_self_attn_q_proj_bias, alloc1668) + R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_4_self_attn_q_proj_bias) + gv2028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape32: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1668, gv2028, 
sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1668) + model_encoder_layers_4_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[65] + gv2029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1669: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2029, R.dtype("float16")) + _1667: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_4_self_attn_k_proj_weight, alloc1667, alloc1669) + R.vm.kill_object(model_encoder_layers_4_self_attn_k_proj_weight) + gv2030: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape33: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1669, gv2030, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1669) + model_encoder_layers_4_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[66] + model_encoder_layers_4_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[67] + gv2031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1670: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2031, R.dtype("float16")) + _1668: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_v_proj_weight, alloc1667, 
model_encoder_layers_4_self_attn_v_proj_bias, alloc1670) + R.vm.kill_object(alloc1667) + R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_4_self_attn_v_proj_bias) + gv2032: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape34: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1670, gv2032, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1670) + gv2033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape35: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape32, gv2033, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape32) + gv2034: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape36: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape33, gv2034, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape33) + gv2035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape37: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape34, gv2035, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape34) + gv2036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1671: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2036, R.dtype("float16")) + _1669: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape35, reshape36, reshape37, alloc1671) + R.vm.kill_object(reshape35) + R.vm.kill_object(reshape36) + R.vm.kill_object(reshape37) + gv2037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape38: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1671, gv2037, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1671) + gv2038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape39: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape38, gv2038, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape38) + model_encoder_layers_4_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[70] + model_encoder_layers_4_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[71] + gv2039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1672: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2039, R.dtype("float16")) + _1670: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_4_self_attn_out_proj_weight, reshape39, model_encoder_layers_4_self_attn_out_proj_bias, alloc1672) + R.vm.kill_object(reshape39) + R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_4_self_attn_out_proj_bias) + gv2040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1673: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2040, R.dtype("float16")) + cls.add4(alloc1666, alloc1672, alloc1673) + R.vm.kill_object(alloc1666) + R.vm.kill_object(alloc1672) + model_encoder_layers_4_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[78] + model_encoder_layers_4_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[79] + gv2041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1674: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2041, R.dtype("float16")) + cls.layer_norm1(alloc1673, model_encoder_layers_4_final_layer_norm_weight, model_encoder_layers_4_final_layer_norm_bias, alloc1674) + R.vm.kill_object(model_encoder_layers_4_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_4_final_layer_norm_bias) + model_encoder_layers_4_fc1_weight: R.Tensor((5120, 1280), 
dtype="float16") = packed_params[74] + model_encoder_layers_4_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[75] + gv2042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1675: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2042, R.dtype("float16")) + _1673: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_4_fc1_weight, alloc1674, model_encoder_layers_4_fc1_bias, alloc1675) + R.vm.kill_object(alloc1674) + R.vm.kill_object(model_encoder_layers_4_fc1_weight) + R.vm.kill_object(model_encoder_layers_4_fc1_bias) + model_encoder_layers_4_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[76] + model_encoder_layers_4_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[77] + gv2043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1676: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2043, R.dtype("float16")) + _1674: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_4_fc2_weight, alloc1675, model_encoder_layers_4_fc2_bias, alloc1676) + R.vm.kill_object(alloc1675) + R.vm.kill_object(model_encoder_layers_4_fc2_weight) + R.vm.kill_object(model_encoder_layers_4_fc2_bias) + gv2044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1677: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, 
R.prim_value(0), gv2044, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1673, alloc1676, alloc1677) + R.vm.kill_object(alloc1673) + R.vm.kill_object(alloc1676) + model_encoder_layers_5_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[87] + model_encoder_layers_5_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[88] + gv2045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1678: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2045, R.dtype("float16")) + cls.layer_norm1(alloc1677, model_encoder_layers_5_self_attn_layer_norm_weight, model_encoder_layers_5_self_attn_layer_norm_bias, alloc1678) + R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_5_self_attn_layer_norm_bias) + model_encoder_layers_5_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[83] + model_encoder_layers_5_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[84] + gv2046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1679: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2046, R.dtype("float16")) + _1677: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_q_proj_weight, alloc1678, model_encoder_layers_5_self_attn_q_proj_bias, alloc1679) + R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_5_self_attn_q_proj_bias) + gv2047: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape40: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1679, gv2047, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1679) + model_encoder_layers_5_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[80] + gv2048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1680: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2048, R.dtype("float16")) + _1678: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_5_self_attn_k_proj_weight, alloc1678, alloc1680) + R.vm.kill_object(model_encoder_layers_5_self_attn_k_proj_weight) + gv2049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape41: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1680, gv2049, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1680) + model_encoder_layers_5_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[81] + model_encoder_layers_5_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[82] + gv2050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1681: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2050, R.dtype("float16")) + _1679: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_v_proj_weight, alloc1678, model_encoder_layers_5_self_attn_v_proj_bias, alloc1681) + R.vm.kill_object(alloc1678) + R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_5_self_attn_v_proj_bias) + gv2051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape42: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1681, gv2051, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1681) + gv2052: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape43: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape40, gv2052, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape40) + gv2053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape44: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape41, gv2053, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(reshape41) + gv2054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape45: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape42, gv2054, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape42) + gv2055: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1682: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2055, R.dtype("float16")) + _1680: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape43, reshape44, reshape45, alloc1682) + R.vm.kill_object(reshape43) + R.vm.kill_object(reshape44) + R.vm.kill_object(reshape45) + gv2056: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape46: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1682, gv2056, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1682) + gv2057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape47: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape46, gv2057, 
sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape46) + model_encoder_layers_5_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[85] + model_encoder_layers_5_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[86] + gv2058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1683: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2058, R.dtype("float16")) + _1681: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_5_self_attn_out_proj_weight, reshape47, model_encoder_layers_5_self_attn_out_proj_bias, alloc1683) + R.vm.kill_object(reshape47) + R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_5_self_attn_out_proj_bias) + gv2059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1684: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2059, R.dtype("float16")) + cls.add4(alloc1677, alloc1683, alloc1684) + R.vm.kill_object(alloc1677) + R.vm.kill_object(alloc1683) + model_encoder_layers_5_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[93] + model_encoder_layers_5_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[94] + gv2060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1685: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2060, R.dtype("float16")) + cls.layer_norm1(alloc1684, model_encoder_layers_5_final_layer_norm_weight, model_encoder_layers_5_final_layer_norm_bias, alloc1685) + R.vm.kill_object(model_encoder_layers_5_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_5_final_layer_norm_bias) + model_encoder_layers_5_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[89] + model_encoder_layers_5_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[90] + gv2061: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1686: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2061, R.dtype("float16")) + _1684: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_5_fc1_weight, alloc1685, model_encoder_layers_5_fc1_bias, alloc1686) + R.vm.kill_object(alloc1685) + R.vm.kill_object(model_encoder_layers_5_fc1_weight) + R.vm.kill_object(model_encoder_layers_5_fc1_bias) + model_encoder_layers_5_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[91] + model_encoder_layers_5_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[92] + gv2062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1687: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2062, R.dtype("float16")) + _1685: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_5_fc2_weight, alloc1686, model_encoder_layers_5_fc2_bias, alloc1687) + R.vm.kill_object(alloc1686) + 
R.vm.kill_object(model_encoder_layers_5_fc2_weight) + R.vm.kill_object(model_encoder_layers_5_fc2_bias) + gv2063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1688: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2063, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1684, alloc1687, alloc1688) + R.vm.kill_object(alloc1684) + R.vm.kill_object(alloc1687) + model_encoder_layers_6_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[102] + model_encoder_layers_6_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[103] + gv2064: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1689: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2064, R.dtype("float16")) + cls.layer_norm1(alloc1688, model_encoder_layers_6_self_attn_layer_norm_weight, model_encoder_layers_6_self_attn_layer_norm_bias, alloc1689) + R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_6_self_attn_layer_norm_bias) + model_encoder_layers_6_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[98] + model_encoder_layers_6_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[99] + gv2065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1690: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2065, 
R.dtype("float16")) + _1688: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_q_proj_weight, alloc1689, model_encoder_layers_6_self_attn_q_proj_bias, alloc1690) + R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_6_self_attn_q_proj_bias) + gv2066: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape48: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1690, gv2066, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1690) + model_encoder_layers_6_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[95] + gv2067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1691: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2067, R.dtype("float16")) + _1689: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_6_self_attn_k_proj_weight, alloc1689, alloc1691) + R.vm.kill_object(model_encoder_layers_6_self_attn_k_proj_weight) + gv2068: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape49: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1691, gv2068, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) 
+ R.vm.kill_object(alloc1691) + model_encoder_layers_6_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[96] + model_encoder_layers_6_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[97] + gv2069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1692: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2069, R.dtype("float16")) + _1690: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_v_proj_weight, alloc1689, model_encoder_layers_6_self_attn_v_proj_bias, alloc1692) + R.vm.kill_object(alloc1689) + R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_6_self_attn_v_proj_bias) + gv2070: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape50: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1692, gv2070, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1692) + gv2071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape51: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape48, gv2071, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape48) + gv2072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape52: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape49, gv2072, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape49) + gv2073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape53: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape50, gv2073, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape50) + gv2074: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1693: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2074, R.dtype("float16")) + _1691: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape51, reshape52, reshape53, alloc1693) + R.vm.kill_object(reshape51) + R.vm.kill_object(reshape52) + R.vm.kill_object(reshape53) + gv2075: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape54: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1693, gv2075, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1693) + gv2076: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape55: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape54, gv2076, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape54) + model_encoder_layers_6_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[100] + model_encoder_layers_6_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[101] + gv2077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1694: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2077, R.dtype("float16")) + _1692: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_6_self_attn_out_proj_weight, reshape55, model_encoder_layers_6_self_attn_out_proj_bias, alloc1694) + R.vm.kill_object(reshape55) + R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_6_self_attn_out_proj_bias) + gv2078: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1695: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2078, R.dtype("float16")) + cls.add4(alloc1688, alloc1694, alloc1695) + R.vm.kill_object(alloc1688) + R.vm.kill_object(alloc1694) + model_encoder_layers_6_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[108] + model_encoder_layers_6_final_layer_norm_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[109] + gv2079: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1696: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2079, R.dtype("float16")) + cls.layer_norm1(alloc1695, model_encoder_layers_6_final_layer_norm_weight, model_encoder_layers_6_final_layer_norm_bias, alloc1696) + R.vm.kill_object(model_encoder_layers_6_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_6_final_layer_norm_bias) + model_encoder_layers_6_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[104] + model_encoder_layers_6_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[105] + gv2080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1697: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2080, R.dtype("float16")) + _1695: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_6_fc1_weight, alloc1696, model_encoder_layers_6_fc1_bias, alloc1697) + R.vm.kill_object(alloc1696) + R.vm.kill_object(model_encoder_layers_6_fc1_weight) + R.vm.kill_object(model_encoder_layers_6_fc1_bias) + model_encoder_layers_6_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[106] + model_encoder_layers_6_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[107] + gv2081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1698: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2081, R.dtype("float16")) + _1696: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_6_fc2_weight, alloc1697, model_encoder_layers_6_fc2_bias, alloc1698) + R.vm.kill_object(alloc1697) + R.vm.kill_object(model_encoder_layers_6_fc2_weight) + R.vm.kill_object(model_encoder_layers_6_fc2_bias) + gv2082: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1699: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2082, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1695, alloc1698, alloc1699) + R.vm.kill_object(alloc1695) + R.vm.kill_object(alloc1698) + model_encoder_layers_7_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[117] + model_encoder_layers_7_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[118] + gv2083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1700: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2083, R.dtype("float16")) + cls.layer_norm1(alloc1699, model_encoder_layers_7_self_attn_layer_norm_weight, model_encoder_layers_7_self_attn_layer_norm_bias, alloc1700) + R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_7_self_attn_layer_norm_bias) + model_encoder_layers_7_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[113] + model_encoder_layers_7_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[114] + gv2084: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1701: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2084, R.dtype("float16")) + _1699: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_q_proj_weight, alloc1700, model_encoder_layers_7_self_attn_q_proj_bias, alloc1701) + R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_7_self_attn_q_proj_bias) + gv2085: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape56: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1701, gv2085, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1701) + model_encoder_layers_7_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[110] + gv2086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1702: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2086, R.dtype("float16")) + _1700: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_7_self_attn_k_proj_weight, alloc1700, alloc1702) + R.vm.kill_object(model_encoder_layers_7_self_attn_k_proj_weight) + gv2087: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape57: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1702, gv2087, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1702) + model_encoder_layers_7_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[111] + model_encoder_layers_7_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[112] + gv2088: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1703: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2088, R.dtype("float16")) + _1701: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_v_proj_weight, alloc1700, model_encoder_layers_7_self_attn_v_proj_bias, alloc1703) + R.vm.kill_object(alloc1700) + R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_7_self_attn_v_proj_bias) + gv2089: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape58: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1703, gv2089, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1703) + gv2090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape59: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape56, gv2090, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape56) + gv2091: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape60: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape57, gv2091, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape57) + gv2092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape61: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape58, gv2092, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape58) + gv2093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1704: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2093, R.dtype("float16")) + _1702: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape59, reshape60, reshape61, alloc1704) + R.vm.kill_object(reshape59) + R.vm.kill_object(reshape60) + R.vm.kill_object(reshape61) + gv2094: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape62: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1704, gv2094, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1704) + gv2095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape63: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape62, gv2095, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape62) + model_encoder_layers_7_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[115] + model_encoder_layers_7_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[116] + gv2096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1705: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2096, R.dtype("float16")) + _1703: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_7_self_attn_out_proj_weight, reshape63, model_encoder_layers_7_self_attn_out_proj_bias, alloc1705) + R.vm.kill_object(reshape63) + R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_7_self_attn_out_proj_bias) + gv2097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1706: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage25, R.prim_value(0), gv2097, R.dtype("float16")) + cls.add4(alloc1699, alloc1705, alloc1706) + R.vm.kill_object(alloc1699) + R.vm.kill_object(alloc1705) + model_encoder_layers_7_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[123] + model_encoder_layers_7_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[124] + gv2098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1707: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2098, R.dtype("float16")) + cls.layer_norm1(alloc1706, model_encoder_layers_7_final_layer_norm_weight, model_encoder_layers_7_final_layer_norm_bias, alloc1707) + R.vm.kill_object(model_encoder_layers_7_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_7_final_layer_norm_bias) + model_encoder_layers_7_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[119] + model_encoder_layers_7_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[120] + gv2099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1708: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2099, R.dtype("float16")) + _1706: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_7_fc1_weight, alloc1707, model_encoder_layers_7_fc1_bias, alloc1708) + R.vm.kill_object(alloc1707) + R.vm.kill_object(model_encoder_layers_7_fc1_weight) + R.vm.kill_object(model_encoder_layers_7_fc1_bias) + model_encoder_layers_7_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[121] + 
model_encoder_layers_7_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[122] + gv2100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1709: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2100, R.dtype("float16")) + _1707: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_7_fc2_weight, alloc1708, model_encoder_layers_7_fc2_bias, alloc1709) + R.vm.kill_object(alloc1708) + R.vm.kill_object(model_encoder_layers_7_fc2_weight) + R.vm.kill_object(model_encoder_layers_7_fc2_bias) + gv2101: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1710: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2101, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1706, alloc1709, alloc1710) + R.vm.kill_object(alloc1706) + R.vm.kill_object(alloc1709) + model_encoder_layers_8_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[132] + model_encoder_layers_8_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[133] + gv2102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1711: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2102, R.dtype("float16")) + cls.layer_norm1(alloc1710, model_encoder_layers_8_self_attn_layer_norm_weight, model_encoder_layers_8_self_attn_layer_norm_bias, alloc1711) + 
R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_8_self_attn_layer_norm_bias) + model_encoder_layers_8_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[128] + model_encoder_layers_8_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[129] + gv2103: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1712: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2103, R.dtype("float16")) + _1710: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_q_proj_weight, alloc1711, model_encoder_layers_8_self_attn_q_proj_bias, alloc1712) + R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_8_self_attn_q_proj_bias) + gv2104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape64: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1712, gv2104, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1712) + model_encoder_layers_8_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[125] + gv2105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1713: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2105, 
R.dtype("float16")) + _1711: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_8_self_attn_k_proj_weight, alloc1711, alloc1713) + R.vm.kill_object(model_encoder_layers_8_self_attn_k_proj_weight) + gv2106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape65: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1713, gv2106, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1713) + model_encoder_layers_8_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[126] + model_encoder_layers_8_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[127] + gv2107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1714: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2107, R.dtype("float16")) + _1712: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_v_proj_weight, alloc1711, model_encoder_layers_8_self_attn_v_proj_bias, alloc1714) + R.vm.kill_object(alloc1711) + R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_8_self_attn_v_proj_bias) + gv2108: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape66: R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1714, gv2108, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1714) + gv2109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape67: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape64, gv2109, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape64) + gv2110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape68: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape65, gv2110, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape65) + gv2111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape69: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape66, gv2111, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape66) + gv2112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1715: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2112, R.dtype("float16")) + _1713: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, 
R.prim_value(8), R.prim_value(T.float32(1)), reshape67, reshape68, reshape69, alloc1715) + R.vm.kill_object(reshape67) + R.vm.kill_object(reshape68) + R.vm.kill_object(reshape69) + gv2113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape70: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1715, gv2113, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1715) + gv2114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape71: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape70, gv2114, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape70) + model_encoder_layers_8_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[130] + model_encoder_layers_8_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[131] + gv2115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1716: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2115, R.dtype("float16")) + _1714: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_8_self_attn_out_proj_weight, reshape71, model_encoder_layers_8_self_attn_out_proj_bias, alloc1716) + R.vm.kill_object(reshape71) + 
R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_8_self_attn_out_proj_bias) + gv2116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1717: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2116, R.dtype("float16")) + cls.add4(alloc1710, alloc1716, alloc1717) + R.vm.kill_object(alloc1710) + R.vm.kill_object(alloc1716) + model_encoder_layers_8_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[138] + model_encoder_layers_8_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[139] + gv2117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1718: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2117, R.dtype("float16")) + cls.layer_norm1(alloc1717, model_encoder_layers_8_final_layer_norm_weight, model_encoder_layers_8_final_layer_norm_bias, alloc1718) + R.vm.kill_object(model_encoder_layers_8_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_8_final_layer_norm_bias) + model_encoder_layers_8_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[134] + model_encoder_layers_8_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[135] + gv2118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1719: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2118, R.dtype("float16")) + _1717: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_8_fc1_weight, alloc1718, model_encoder_layers_8_fc1_bias, alloc1719) + R.vm.kill_object(alloc1718) + R.vm.kill_object(model_encoder_layers_8_fc1_weight) + R.vm.kill_object(model_encoder_layers_8_fc1_bias) + model_encoder_layers_8_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[136] + model_encoder_layers_8_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[137] + gv2119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1720: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2119, R.dtype("float16")) + _1718: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_8_fc2_weight, alloc1719, model_encoder_layers_8_fc2_bias, alloc1720) + R.vm.kill_object(alloc1719) + R.vm.kill_object(model_encoder_layers_8_fc2_weight) + R.vm.kill_object(model_encoder_layers_8_fc2_bias) + gv2120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1721: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2120, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1717, alloc1720, alloc1721) + R.vm.kill_object(alloc1717) + R.vm.kill_object(alloc1720) + model_encoder_layers_9_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[147] + model_encoder_layers_9_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[148] + gv2121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1722: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2121, R.dtype("float16")) + cls.layer_norm1(alloc1721, model_encoder_layers_9_self_attn_layer_norm_weight, model_encoder_layers_9_self_attn_layer_norm_bias, alloc1722) + R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_9_self_attn_layer_norm_bias) + model_encoder_layers_9_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[143] + model_encoder_layers_9_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[144] + gv2122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1723: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2122, R.dtype("float16")) + _1721: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_q_proj_weight, alloc1722, model_encoder_layers_9_self_attn_q_proj_bias, alloc1723) + R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_9_self_attn_q_proj_bias) + gv2123: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape72: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1723, gv2123, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1723) + model_encoder_layers_9_self_attn_k_proj_weight: R.Tensor((1280, 
1280), dtype="float16") = packed_params[140] + gv2124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1724: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2124, R.dtype("float16")) + _1722: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_9_self_attn_k_proj_weight, alloc1722, alloc1724) + R.vm.kill_object(model_encoder_layers_9_self_attn_k_proj_weight) + gv2125: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape73: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1724, gv2125, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1724) + model_encoder_layers_9_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[141] + model_encoder_layers_9_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[142] + gv2126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1725: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2126, R.dtype("float16")) + _1723: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_v_proj_weight, alloc1722, model_encoder_layers_9_self_attn_v_proj_bias, alloc1725) + R.vm.kill_object(alloc1722) + R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_weight) + 
R.vm.kill_object(model_encoder_layers_9_self_attn_v_proj_bias) + gv2127: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape74: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1725, gv2127, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1725) + gv2128: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape75: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape72, gv2128, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape72) + gv2129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape76: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape73, gv2129, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape73) + gv2130: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape77: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape74, gv2130, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape74) + gv2131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1726: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2131, R.dtype("float16")) + _1724: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape75, reshape76, reshape77, alloc1726) + R.vm.kill_object(reshape75) + R.vm.kill_object(reshape76) + R.vm.kill_object(reshape77) + gv2132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape78: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1726, gv2132, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1726) + gv2133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape79: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape78, gv2133, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape78) + model_encoder_layers_9_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[145] + model_encoder_layers_9_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[146] + gv2134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1727: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2134, R.dtype("float16")) + _1725: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_9_self_attn_out_proj_weight, reshape79, model_encoder_layers_9_self_attn_out_proj_bias, alloc1727) + R.vm.kill_object(reshape79) + R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_9_self_attn_out_proj_bias) + gv2135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1728: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2135, R.dtype("float16")) + cls.add4(alloc1721, alloc1727, alloc1728) + R.vm.kill_object(alloc1721) + R.vm.kill_object(alloc1727) + model_encoder_layers_9_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[153] + model_encoder_layers_9_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[154] + gv2136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1729: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2136, R.dtype("float16")) + cls.layer_norm1(alloc1728, model_encoder_layers_9_final_layer_norm_weight, model_encoder_layers_9_final_layer_norm_bias, alloc1729) + R.vm.kill_object(model_encoder_layers_9_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_9_final_layer_norm_bias) + model_encoder_layers_9_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[149] + model_encoder_layers_9_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[150] + gv2137: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1730: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2137, R.dtype("float16")) + _1728: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_9_fc1_weight, alloc1729, model_encoder_layers_9_fc1_bias, alloc1730) + R.vm.kill_object(alloc1729) + R.vm.kill_object(model_encoder_layers_9_fc1_weight) + R.vm.kill_object(model_encoder_layers_9_fc1_bias) + model_encoder_layers_9_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[151] + model_encoder_layers_9_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[152] + gv2138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1731: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2138, R.dtype("float16")) + _1729: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_9_fc2_weight, alloc1730, model_encoder_layers_9_fc2_bias, alloc1731) + R.vm.kill_object(alloc1730) + R.vm.kill_object(model_encoder_layers_9_fc2_weight) + R.vm.kill_object(model_encoder_layers_9_fc2_bias) + gv2139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1732: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2139, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1728, alloc1731, alloc1732) + R.vm.kill_object(alloc1728) + 
R.vm.kill_object(alloc1731) + model_encoder_layers_10_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[162] + model_encoder_layers_10_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[163] + gv2140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1733: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2140, R.dtype("float16")) + cls.layer_norm1(alloc1732, model_encoder_layers_10_self_attn_layer_norm_weight, model_encoder_layers_10_self_attn_layer_norm_bias, alloc1733) + R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_10_self_attn_layer_norm_bias) + model_encoder_layers_10_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[158] + model_encoder_layers_10_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[159] + gv2141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1734: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2141, R.dtype("float16")) + _1732: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_q_proj_weight, alloc1733, model_encoder_layers_10_self_attn_q_proj_bias, alloc1734) + R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_10_self_attn_q_proj_bias) + gv2142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape80: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1734, gv2142, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1734) + model_encoder_layers_10_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[155] + gv2143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1735: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2143, R.dtype("float16")) + _1733: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_10_self_attn_k_proj_weight, alloc1733, alloc1735) + R.vm.kill_object(model_encoder_layers_10_self_attn_k_proj_weight) + gv2144: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape81: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1735, gv2144, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1735) + model_encoder_layers_10_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[156] + model_encoder_layers_10_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[157] + gv2145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1736: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage24, R.prim_value(0), gv2145, R.dtype("float16")) + _1734: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_v_proj_weight, alloc1733, model_encoder_layers_10_self_attn_v_proj_bias, alloc1736) + R.vm.kill_object(alloc1733) + R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_10_self_attn_v_proj_bias) + gv2146: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape82: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1736, gv2146, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1736) + gv2147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape83: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape80, gv2147, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape80) + gv2148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape84: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape81, gv2148, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape81) + gv2149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape85: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape82, gv2149, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape82) + gv2150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1737: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2150, R.dtype("float16")) + _1735: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape83, reshape84, reshape85, alloc1737) + R.vm.kill_object(reshape83) + R.vm.kill_object(reshape84) + R.vm.kill_object(reshape85) + gv2151: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape86: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1737, gv2151, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1737) + gv2152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape87: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape86, gv2152, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape86) + model_encoder_layers_10_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[160] + model_encoder_layers_10_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[161] + gv2153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1738: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2153, R.dtype("float16")) + _1736: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_10_self_attn_out_proj_weight, reshape87, model_encoder_layers_10_self_attn_out_proj_bias, alloc1738) + R.vm.kill_object(reshape87) + R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_10_self_attn_out_proj_bias) + gv2154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1739: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2154, R.dtype("float16")) + cls.add4(alloc1732, alloc1738, alloc1739) + R.vm.kill_object(alloc1732) + R.vm.kill_object(alloc1738) + model_encoder_layers_10_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[168] + model_encoder_layers_10_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[169] + gv2155: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1740: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2155, R.dtype("float16")) + cls.layer_norm1(alloc1739, model_encoder_layers_10_final_layer_norm_weight, 
model_encoder_layers_10_final_layer_norm_bias, alloc1740) + R.vm.kill_object(model_encoder_layers_10_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_10_final_layer_norm_bias) + model_encoder_layers_10_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[164] + model_encoder_layers_10_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[165] + gv2156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1741: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2156, R.dtype("float16")) + _1739: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_10_fc1_weight, alloc1740, model_encoder_layers_10_fc1_bias, alloc1741) + R.vm.kill_object(alloc1740) + R.vm.kill_object(model_encoder_layers_10_fc1_weight) + R.vm.kill_object(model_encoder_layers_10_fc1_bias) + model_encoder_layers_10_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[166] + model_encoder_layers_10_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[167] + gv2157: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1742: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2157, R.dtype("float16")) + _1740: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_10_fc2_weight, alloc1741, model_encoder_layers_10_fc2_bias, alloc1742) + R.vm.kill_object(alloc1741) + R.vm.kill_object(model_encoder_layers_10_fc2_weight) + R.vm.kill_object(model_encoder_layers_10_fc2_bias) + gv2158: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1743: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2158, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1739, alloc1742, alloc1743) + R.vm.kill_object(alloc1739) + R.vm.kill_object(alloc1742) + model_encoder_layers_11_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[177] + model_encoder_layers_11_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[178] + gv2159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1744: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2159, R.dtype("float16")) + cls.layer_norm1(alloc1743, model_encoder_layers_11_self_attn_layer_norm_weight, model_encoder_layers_11_self_attn_layer_norm_bias, alloc1744) + R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_11_self_attn_layer_norm_bias) + model_encoder_layers_11_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[173] + model_encoder_layers_11_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[174] + gv2160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1745: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2160, R.dtype("float16")) + _1743: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", 
model_encoder_layers_11_self_attn_q_proj_weight, alloc1744, model_encoder_layers_11_self_attn_q_proj_bias, alloc1745) + R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_11_self_attn_q_proj_bias) + gv2161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape88: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1745, gv2161, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1745) + model_encoder_layers_11_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[170] + gv2162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1746: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2162, R.dtype("float16")) + _1744: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_11_self_attn_k_proj_weight, alloc1744, alloc1746) + R.vm.kill_object(model_encoder_layers_11_self_attn_k_proj_weight) + gv2163: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape89: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1746, gv2163, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1746) + model_encoder_layers_11_self_attn_v_proj_weight: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[171] + model_encoder_layers_11_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[172] + gv2164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1747: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2164, R.dtype("float16")) + _1745: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_v_proj_weight, alloc1744, model_encoder_layers_11_self_attn_v_proj_bias, alloc1747) + R.vm.kill_object(alloc1744) + R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_11_self_attn_v_proj_bias) + gv2165: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape90: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1747, gv2165, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1747) + gv2166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape91: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape88, gv2166, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape88) + gv2167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape92: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape89, gv2167, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape89) + gv2168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape93: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape90, gv2168, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape90) + gv2169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1748: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2169, R.dtype("float16")) + _1746: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape91, reshape92, reshape93, alloc1748) + R.vm.kill_object(reshape91) + R.vm.kill_object(reshape92) + R.vm.kill_object(reshape93) + gv2170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape94: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1748, gv2170, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1748) + gv2171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape95: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape94, gv2171, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape94) + model_encoder_layers_11_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[175] + model_encoder_layers_11_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[176] + gv2172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1749: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2172, R.dtype("float16")) + _1747: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_11_self_attn_out_proj_weight, reshape95, model_encoder_layers_11_self_attn_out_proj_bias, alloc1749) + R.vm.kill_object(reshape95) + R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_11_self_attn_out_proj_bias) + gv2173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1750: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2173, R.dtype("float16")) + cls.add4(alloc1743, alloc1749, alloc1750) + R.vm.kill_object(alloc1743) + R.vm.kill_object(alloc1749) + model_encoder_layers_11_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[183] + model_encoder_layers_11_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[184] + gv2174: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1751: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2174, R.dtype("float16")) + cls.layer_norm1(alloc1750, model_encoder_layers_11_final_layer_norm_weight, model_encoder_layers_11_final_layer_norm_bias, alloc1751) + R.vm.kill_object(model_encoder_layers_11_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_11_final_layer_norm_bias) + model_encoder_layers_11_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[179] + model_encoder_layers_11_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[180] + gv2175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1752: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2175, R.dtype("float16")) + _1750: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_11_fc1_weight, alloc1751, model_encoder_layers_11_fc1_bias, alloc1752) + R.vm.kill_object(alloc1751) + R.vm.kill_object(model_encoder_layers_11_fc1_weight) + R.vm.kill_object(model_encoder_layers_11_fc1_bias) + model_encoder_layers_11_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[181] + model_encoder_layers_11_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[182] + gv2176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1753: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage27, R.prim_value(0), gv2176, R.dtype("float16")) + _1751: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_11_fc2_weight, alloc1752, model_encoder_layers_11_fc2_bias, alloc1753) + R.vm.kill_object(alloc1752) + R.vm.kill_object(model_encoder_layers_11_fc2_weight) + R.vm.kill_object(model_encoder_layers_11_fc2_bias) + gv2177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1754: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2177, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1750, alloc1753, alloc1754) + R.vm.kill_object(alloc1750) + R.vm.kill_object(alloc1753) + model_encoder_layers_12_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[192] + model_encoder_layers_12_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[193] + gv2178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1755: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2178, R.dtype("float16")) + cls.layer_norm1(alloc1754, model_encoder_layers_12_self_attn_layer_norm_weight, model_encoder_layers_12_self_attn_layer_norm_bias, alloc1755) + R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_12_self_attn_layer_norm_bias) + model_encoder_layers_12_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[188] + model_encoder_layers_12_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[189] + gv2179: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1756: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2179, R.dtype("float16")) + _1754: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_q_proj_weight, alloc1755, model_encoder_layers_12_self_attn_q_proj_bias, alloc1756) + R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_12_self_attn_q_proj_bias) + gv2180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape96: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1756, gv2180, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1756) + model_encoder_layers_12_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[185] + gv2181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1757: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2181, R.dtype("float16")) + _1755: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_12_self_attn_k_proj_weight, alloc1755, alloc1757) + R.vm.kill_object(model_encoder_layers_12_self_attn_k_proj_weight) + gv2182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape97: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1757, gv2182, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1757) + model_encoder_layers_12_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[186] + model_encoder_layers_12_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[187] + gv2183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1758: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2183, R.dtype("float16")) + _1756: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_v_proj_weight, alloc1755, model_encoder_layers_12_self_attn_v_proj_bias, alloc1758) + R.vm.kill_object(alloc1755) + R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_12_self_attn_v_proj_bias) + gv2184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape98: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1758, gv2184, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1758) + gv2185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape99: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape96, gv2185, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape96) + gv2186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape100: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape97, gv2186, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape97) + gv2187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape101: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape98, gv2187, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape98) + gv2188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1759: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2188, R.dtype("float16")) + _1757: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape99, reshape100, reshape101, alloc1759) + R.vm.kill_object(reshape99) + R.vm.kill_object(reshape100) + R.vm.kill_object(reshape101) + gv2189: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape102: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1759, gv2189, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1759) + gv2190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape103: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape102, gv2190, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape102) + model_encoder_layers_12_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[190] + model_encoder_layers_12_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[191] + gv2191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1760: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2191, R.dtype("float16")) + _1758: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_12_self_attn_out_proj_weight, reshape103, model_encoder_layers_12_self_attn_out_proj_bias, alloc1760) + R.vm.kill_object(reshape103) + R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_12_self_attn_out_proj_bias) + gv2192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1761: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2192, R.dtype("float16")) + cls.add4(alloc1754, alloc1760, alloc1761) + R.vm.kill_object(alloc1754) + R.vm.kill_object(alloc1760) + model_encoder_layers_12_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[198] + model_encoder_layers_12_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[199] + gv2193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1762: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2193, R.dtype("float16")) + cls.layer_norm1(alloc1761, model_encoder_layers_12_final_layer_norm_weight, model_encoder_layers_12_final_layer_norm_bias, alloc1762) + R.vm.kill_object(model_encoder_layers_12_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_12_final_layer_norm_bias) + model_encoder_layers_12_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[194] + model_encoder_layers_12_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[195] + gv2194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1763: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2194, R.dtype("float16")) + _1761: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_12_fc1_weight, alloc1762, model_encoder_layers_12_fc1_bias, alloc1763) + R.vm.kill_object(alloc1762) + R.vm.kill_object(model_encoder_layers_12_fc1_weight) + R.vm.kill_object(model_encoder_layers_12_fc1_bias) + model_encoder_layers_12_fc2_weight: R.Tensor((1280, 
5120), dtype="float16") = packed_params[196] + model_encoder_layers_12_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[197] + gv2195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1764: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2195, R.dtype("float16")) + _1762: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_12_fc2_weight, alloc1763, model_encoder_layers_12_fc2_bias, alloc1764) + R.vm.kill_object(alloc1763) + R.vm.kill_object(model_encoder_layers_12_fc2_weight) + R.vm.kill_object(model_encoder_layers_12_fc2_bias) + gv2196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1765: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2196, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1761, alloc1764, alloc1765) + R.vm.kill_object(alloc1761) + R.vm.kill_object(alloc1764) + model_encoder_layers_13_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[207] + model_encoder_layers_13_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[208] + gv2197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1766: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2197, R.dtype("float16")) + cls.layer_norm1(alloc1765, model_encoder_layers_13_self_attn_layer_norm_weight, 
model_encoder_layers_13_self_attn_layer_norm_bias, alloc1766) + R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_13_self_attn_layer_norm_bias) + model_encoder_layers_13_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[203] + model_encoder_layers_13_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[204] + gv2198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1767: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2198, R.dtype("float16")) + _1765: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_q_proj_weight, alloc1766, model_encoder_layers_13_self_attn_q_proj_bias, alloc1767) + R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_13_self_attn_q_proj_bias) + gv2199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape104: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1767, gv2199, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1767) + model_encoder_layers_13_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[200] + gv2200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1768: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2200, R.dtype("float16")) + _1766: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_13_self_attn_k_proj_weight, alloc1766, alloc1768) + R.vm.kill_object(model_encoder_layers_13_self_attn_k_proj_weight) + gv2201: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape105: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1768, gv2201, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1768) + model_encoder_layers_13_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[201] + model_encoder_layers_13_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[202] + gv2202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1769: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2202, R.dtype("float16")) + _1767: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_v_proj_weight, alloc1766, model_encoder_layers_13_self_attn_v_proj_bias, alloc1769) + R.vm.kill_object(alloc1766) + R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_13_self_attn_v_proj_bias) + gv2203: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape106: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1769, gv2203, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1769) + gv2204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape107: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape104, gv2204, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape104) + gv2205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape108: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape105, gv2205, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape105) + gv2206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape109: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape106, gv2206, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape106) + gv2207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1770: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2207, 
R.dtype("float16")) + _1768: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape107, reshape108, reshape109, alloc1770) + R.vm.kill_object(reshape107) + R.vm.kill_object(reshape108) + R.vm.kill_object(reshape109) + gv2208: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape110: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1770, gv2208, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1770) + gv2209: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape111: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape110, gv2209, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape110) + model_encoder_layers_13_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[205] + model_encoder_layers_13_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[206] + gv2210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1771: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2210, R.dtype("float16")) + _1769: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_13_self_attn_out_proj_weight, reshape111, 
model_encoder_layers_13_self_attn_out_proj_bias, alloc1771) + R.vm.kill_object(reshape111) + R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_13_self_attn_out_proj_bias) + gv2211: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1772: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2211, R.dtype("float16")) + cls.add4(alloc1765, alloc1771, alloc1772) + R.vm.kill_object(alloc1765) + R.vm.kill_object(alloc1771) + model_encoder_layers_13_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[213] + model_encoder_layers_13_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[214] + gv2212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1773: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2212, R.dtype("float16")) + cls.layer_norm1(alloc1772, model_encoder_layers_13_final_layer_norm_weight, model_encoder_layers_13_final_layer_norm_bias, alloc1773) + R.vm.kill_object(model_encoder_layers_13_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_13_final_layer_norm_bias) + model_encoder_layers_13_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[209] + model_encoder_layers_13_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[210] + gv2213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1774: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2213, R.dtype("float16")) + _1772: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_13_fc1_weight, alloc1773, model_encoder_layers_13_fc1_bias, alloc1774) + R.vm.kill_object(alloc1773) + R.vm.kill_object(model_encoder_layers_13_fc1_weight) + R.vm.kill_object(model_encoder_layers_13_fc1_bias) + model_encoder_layers_13_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[211] + model_encoder_layers_13_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[212] + gv2214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1775: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2214, R.dtype("float16")) + _1773: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_13_fc2_weight, alloc1774, model_encoder_layers_13_fc2_bias, alloc1775) + R.vm.kill_object(alloc1774) + R.vm.kill_object(model_encoder_layers_13_fc2_weight) + R.vm.kill_object(model_encoder_layers_13_fc2_bias) + gv2215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1776: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2215, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1772, alloc1775, alloc1776) + R.vm.kill_object(alloc1772) + R.vm.kill_object(alloc1775) + model_encoder_layers_14_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[222] + model_encoder_layers_14_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[223] + 
gv2216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1777: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2216, R.dtype("float16")) + cls.layer_norm1(alloc1776, model_encoder_layers_14_self_attn_layer_norm_weight, model_encoder_layers_14_self_attn_layer_norm_bias, alloc1777) + R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_14_self_attn_layer_norm_bias) + model_encoder_layers_14_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[218] + model_encoder_layers_14_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[219] + gv2217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1778: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2217, R.dtype("float16")) + _1776: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_q_proj_weight, alloc1777, model_encoder_layers_14_self_attn_q_proj_bias, alloc1778) + R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_14_self_attn_q_proj_bias) + gv2218: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape112: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1778, gv2218, sinfo_args=(R.Tensor((batch_size, 1500, 20, 
64), dtype="float16"),)) + R.vm.kill_object(alloc1778) + model_encoder_layers_14_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[215] + gv2219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1779: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2219, R.dtype("float16")) + _1777: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_14_self_attn_k_proj_weight, alloc1777, alloc1779) + R.vm.kill_object(model_encoder_layers_14_self_attn_k_proj_weight) + gv2220: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape113: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1779, gv2220, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1779) + model_encoder_layers_14_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[216] + model_encoder_layers_14_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[217] + gv2221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1780: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2221, R.dtype("float16")) + _1778: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_v_proj_weight, alloc1777, 
model_encoder_layers_14_self_attn_v_proj_bias, alloc1780) + R.vm.kill_object(alloc1777) + R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_14_self_attn_v_proj_bias) + gv2222: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape114: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1780, gv2222, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1780) + gv2223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape115: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape112, gv2223, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape112) + gv2224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape116: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape113, gv2224, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape113) + gv2225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape117: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape114, gv2225, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape114) + gv2226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1781: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2226, R.dtype("float16")) + _1779: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape115, reshape116, reshape117, alloc1781) + R.vm.kill_object(reshape115) + R.vm.kill_object(reshape116) + R.vm.kill_object(reshape117) + gv2227: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape118: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1781, gv2227, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1781) + gv2228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape119: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape118, gv2228, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape118) + model_encoder_layers_14_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[220] + model_encoder_layers_14_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[221] + gv2229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1782: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2229, R.dtype("float16")) + _1780: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_14_self_attn_out_proj_weight, reshape119, model_encoder_layers_14_self_attn_out_proj_bias, alloc1782) + R.vm.kill_object(reshape119) + R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_14_self_attn_out_proj_bias) + gv2230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1783: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2230, R.dtype("float16")) + cls.add4(alloc1776, alloc1782, alloc1783) + R.vm.kill_object(alloc1776) + R.vm.kill_object(alloc1782) + model_encoder_layers_14_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[228] + model_encoder_layers_14_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[229] + gv2231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1784: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2231, R.dtype("float16")) + cls.layer_norm1(alloc1783, model_encoder_layers_14_final_layer_norm_weight, model_encoder_layers_14_final_layer_norm_bias, alloc1784) + R.vm.kill_object(model_encoder_layers_14_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_14_final_layer_norm_bias) + 
model_encoder_layers_14_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[224] + model_encoder_layers_14_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[225] + gv2232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1785: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2232, R.dtype("float16")) + _1783: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_14_fc1_weight, alloc1784, model_encoder_layers_14_fc1_bias, alloc1785) + R.vm.kill_object(alloc1784) + R.vm.kill_object(model_encoder_layers_14_fc1_weight) + R.vm.kill_object(model_encoder_layers_14_fc1_bias) + model_encoder_layers_14_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[226] + model_encoder_layers_14_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[227] + gv2233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1786: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2233, R.dtype("float16")) + _1784: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_14_fc2_weight, alloc1785, model_encoder_layers_14_fc2_bias, alloc1786) + R.vm.kill_object(alloc1785) + R.vm.kill_object(model_encoder_layers_14_fc2_weight) + R.vm.kill_object(model_encoder_layers_14_fc2_bias) + gv2234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc1787: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2234, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1783, alloc1786, alloc1787) + R.vm.kill_object(alloc1783) + R.vm.kill_object(alloc1786) + model_encoder_layers_15_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[237] + model_encoder_layers_15_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[238] + gv2235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1788: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2235, R.dtype("float16")) + cls.layer_norm1(alloc1787, model_encoder_layers_15_self_attn_layer_norm_weight, model_encoder_layers_15_self_attn_layer_norm_bias, alloc1788) + R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_15_self_attn_layer_norm_bias) + model_encoder_layers_15_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[233] + model_encoder_layers_15_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[234] + gv2236: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1789: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2236, R.dtype("float16")) + _1787: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_q_proj_weight, alloc1788, model_encoder_layers_15_self_attn_q_proj_bias, alloc1789) + R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_weight) + 
R.vm.kill_object(model_encoder_layers_15_self_attn_q_proj_bias) + gv2237: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape120: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1789, gv2237, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1789) + model_encoder_layers_15_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[230] + gv2238: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1790: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2238, R.dtype("float16")) + _1788: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_15_self_attn_k_proj_weight, alloc1788, alloc1790) + R.vm.kill_object(model_encoder_layers_15_self_attn_k_proj_weight) + gv2239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape121: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1790, gv2239, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1790) + model_encoder_layers_15_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[231] + model_encoder_layers_15_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[232] + gv2240: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1791: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2240, R.dtype("float16")) + _1789: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_v_proj_weight, alloc1788, model_encoder_layers_15_self_attn_v_proj_bias, alloc1791) + R.vm.kill_object(alloc1788) + R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_15_self_attn_v_proj_bias) + gv2241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape122: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1791, gv2241, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1791) + gv2242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape123: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape120, gv2242, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape120) + gv2243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape124: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
reshape121, gv2243, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape121) + gv2244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape125: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape122, gv2244, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape122) + gv2245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1792: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2245, R.dtype("float16")) + _1790: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape123, reshape124, reshape125, alloc1792) + R.vm.kill_object(reshape123) + R.vm.kill_object(reshape124) + R.vm.kill_object(reshape125) + gv2246: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape126: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1792, gv2246, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1792) + gv2247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape127: R.Tensor((batch_size, 1500, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape126, gv2247, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape126) + model_encoder_layers_15_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[235] + model_encoder_layers_15_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[236] + gv2248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1793: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2248, R.dtype("float16")) + _1791: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_15_self_attn_out_proj_weight, reshape127, model_encoder_layers_15_self_attn_out_proj_bias, alloc1793) + R.vm.kill_object(reshape127) + R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_15_self_attn_out_proj_bias) + gv2249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1794: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2249, R.dtype("float16")) + cls.add4(alloc1787, alloc1793, alloc1794) + R.vm.kill_object(alloc1787) + R.vm.kill_object(alloc1793) + model_encoder_layers_15_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[243] + model_encoder_layers_15_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[244] + gv2250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1795: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2250, R.dtype("float16")) + cls.layer_norm1(alloc1794, model_encoder_layers_15_final_layer_norm_weight, model_encoder_layers_15_final_layer_norm_bias, alloc1795) + R.vm.kill_object(model_encoder_layers_15_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_15_final_layer_norm_bias) + model_encoder_layers_15_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[239] + model_encoder_layers_15_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[240] + gv2251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1796: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2251, R.dtype("float16")) + _1794: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_15_fc1_weight, alloc1795, model_encoder_layers_15_fc1_bias, alloc1796) + R.vm.kill_object(alloc1795) + R.vm.kill_object(model_encoder_layers_15_fc1_weight) + R.vm.kill_object(model_encoder_layers_15_fc1_bias) + model_encoder_layers_15_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[241] + model_encoder_layers_15_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[242] + gv2252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1797: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2252, R.dtype("float16")) + _1795: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", 
model_encoder_layers_15_fc2_weight, alloc1796, model_encoder_layers_15_fc2_bias, alloc1797) + R.vm.kill_object(alloc1796) + R.vm.kill_object(model_encoder_layers_15_fc2_weight) + R.vm.kill_object(model_encoder_layers_15_fc2_bias) + gv2253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1798: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2253, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1794, alloc1797, alloc1798) + R.vm.kill_object(alloc1794) + R.vm.kill_object(alloc1797) + model_encoder_layers_16_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[252] + model_encoder_layers_16_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[253] + gv2254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1799: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2254, R.dtype("float16")) + cls.layer_norm1(alloc1798, model_encoder_layers_16_self_attn_layer_norm_weight, model_encoder_layers_16_self_attn_layer_norm_bias, alloc1799) + R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_16_self_attn_layer_norm_bias) + model_encoder_layers_16_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[248] + model_encoder_layers_16_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[249] + gv2255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc1800: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2255, R.dtype("float16")) + _1798: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_q_proj_weight, alloc1799, model_encoder_layers_16_self_attn_q_proj_bias, alloc1800) + R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_16_self_attn_q_proj_bias) + gv2256: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape128: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1800, gv2256, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1800) + model_encoder_layers_16_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[245] + gv2257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1801: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2257, R.dtype("float16")) + _1799: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_16_self_attn_k_proj_weight, alloc1799, alloc1801) + R.vm.kill_object(model_encoder_layers_16_self_attn_k_proj_weight) + gv2258: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape129: R.Tensor((batch_size, 1500, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1801, gv2258, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1801) + model_encoder_layers_16_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[246] + model_encoder_layers_16_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[247] + gv2259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1802: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2259, R.dtype("float16")) + _1800: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_v_proj_weight, alloc1799, model_encoder_layers_16_self_attn_v_proj_bias, alloc1802) + R.vm.kill_object(alloc1799) + R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_16_self_attn_v_proj_bias) + gv2260: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape130: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1802, gv2260, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1802) + gv2261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape131: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape128, gv2261, 
sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape128) + gv2262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape132: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape129, gv2262, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape129) + gv2263: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape133: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape130, gv2263, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape130) + gv2264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1803: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2264, R.dtype("float16")) + _1801: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape131, reshape132, reshape133, alloc1803) + R.vm.kill_object(reshape131) + R.vm.kill_object(reshape132) + R.vm.kill_object(reshape133) + gv2265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape134: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1803, gv2265, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1803) + gv2266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape135: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape134, gv2266, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape134) + model_encoder_layers_16_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[250] + model_encoder_layers_16_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[251] + gv2267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1804: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2267, R.dtype("float16")) + _1802: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_16_self_attn_out_proj_weight, reshape135, model_encoder_layers_16_self_attn_out_proj_bias, alloc1804) + R.vm.kill_object(reshape135) + R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_16_self_attn_out_proj_bias) + gv2268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1805: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2268, R.dtype("float16")) + cls.add4(alloc1798, alloc1804, alloc1805) + 
R.vm.kill_object(alloc1798) + R.vm.kill_object(alloc1804) + model_encoder_layers_16_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[258] + model_encoder_layers_16_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[259] + gv2269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1806: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2269, R.dtype("float16")) + cls.layer_norm1(alloc1805, model_encoder_layers_16_final_layer_norm_weight, model_encoder_layers_16_final_layer_norm_bias, alloc1806) + R.vm.kill_object(model_encoder_layers_16_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_16_final_layer_norm_bias) + model_encoder_layers_16_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[254] + model_encoder_layers_16_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[255] + gv2270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1807: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2270, R.dtype("float16")) + _1805: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_16_fc1_weight, alloc1806, model_encoder_layers_16_fc1_bias, alloc1807) + R.vm.kill_object(alloc1806) + R.vm.kill_object(model_encoder_layers_16_fc1_weight) + R.vm.kill_object(model_encoder_layers_16_fc1_bias) + model_encoder_layers_16_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[256] + model_encoder_layers_16_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[257] + gv2271: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1808: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2271, R.dtype("float16")) + _1806: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_16_fc2_weight, alloc1807, model_encoder_layers_16_fc2_bias, alloc1808) + R.vm.kill_object(alloc1807) + R.vm.kill_object(model_encoder_layers_16_fc2_weight) + R.vm.kill_object(model_encoder_layers_16_fc2_bias) + gv2272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1809: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2272, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1805, alloc1808, alloc1809) + R.vm.kill_object(alloc1805) + R.vm.kill_object(alloc1808) + model_encoder_layers_17_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[267] + model_encoder_layers_17_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[268] + gv2273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1810: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2273, R.dtype("float16")) + cls.layer_norm1(alloc1809, model_encoder_layers_17_self_attn_layer_norm_weight, model_encoder_layers_17_self_attn_layer_norm_bias, alloc1810) + R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_weight) + 
R.vm.kill_object(model_encoder_layers_17_self_attn_layer_norm_bias) + model_encoder_layers_17_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[263] + model_encoder_layers_17_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[264] + gv2274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1811: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2274, R.dtype("float16")) + _1809: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_q_proj_weight, alloc1810, model_encoder_layers_17_self_attn_q_proj_bias, alloc1811) + R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_17_self_attn_q_proj_bias) + gv2275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape136: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1811, gv2275, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1811) + model_encoder_layers_17_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[260] + gv2276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1812: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2276, R.dtype("float16")) + _1810: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_17_self_attn_k_proj_weight, alloc1810, alloc1812) + R.vm.kill_object(model_encoder_layers_17_self_attn_k_proj_weight) + gv2277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape137: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1812, gv2277, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1812) + model_encoder_layers_17_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[261] + model_encoder_layers_17_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[262] + gv2278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1813: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2278, R.dtype("float16")) + _1811: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_v_proj_weight, alloc1810, model_encoder_layers_17_self_attn_v_proj_bias, alloc1813) + R.vm.kill_object(alloc1810) + R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_17_self_attn_v_proj_bias) + gv2279: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape138: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1813, gv2279, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1813) + gv2280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape139: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape136, gv2280, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape136) + gv2281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape140: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape137, gv2281, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape137) + gv2282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape141: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape138, gv2282, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape138) + gv2283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1814: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2283, R.dtype("float16")) + _1812: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, 
R.prim_value(17), R.prim_value(T.float32(1)), reshape139, reshape140, reshape141, alloc1814) + R.vm.kill_object(reshape139) + R.vm.kill_object(reshape140) + R.vm.kill_object(reshape141) + gv2284: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape142: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1814, gv2284, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1814) + gv2285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape143: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape142, gv2285, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape142) + model_encoder_layers_17_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[265] + model_encoder_layers_17_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[266] + gv2286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1815: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2286, R.dtype("float16")) + _1813: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_17_self_attn_out_proj_weight, reshape143, model_encoder_layers_17_self_attn_out_proj_bias, alloc1815) + R.vm.kill_object(reshape143) + 
R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_17_self_attn_out_proj_bias) + gv2287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1816: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2287, R.dtype("float16")) + cls.add4(alloc1809, alloc1815, alloc1816) + R.vm.kill_object(alloc1809) + R.vm.kill_object(alloc1815) + model_encoder_layers_17_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[273] + model_encoder_layers_17_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[274] + gv2288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1817: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2288, R.dtype("float16")) + cls.layer_norm1(alloc1816, model_encoder_layers_17_final_layer_norm_weight, model_encoder_layers_17_final_layer_norm_bias, alloc1817) + R.vm.kill_object(model_encoder_layers_17_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_17_final_layer_norm_bias) + model_encoder_layers_17_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[269] + model_encoder_layers_17_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[270] + gv2289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1818: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2289, R.dtype("float16")) + _1816: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_17_fc1_weight, alloc1817, model_encoder_layers_17_fc1_bias, alloc1818) + R.vm.kill_object(alloc1817) + R.vm.kill_object(model_encoder_layers_17_fc1_weight) + R.vm.kill_object(model_encoder_layers_17_fc1_bias) + model_encoder_layers_17_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[271] + model_encoder_layers_17_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[272] + gv2290: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1819: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2290, R.dtype("float16")) + _1817: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_17_fc2_weight, alloc1818, model_encoder_layers_17_fc2_bias, alloc1819) + R.vm.kill_object(alloc1818) + R.vm.kill_object(model_encoder_layers_17_fc2_weight) + R.vm.kill_object(model_encoder_layers_17_fc2_bias) + gv2291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1820: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2291, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1816, alloc1819, alloc1820) + R.vm.kill_object(alloc1816) + R.vm.kill_object(alloc1819) + model_encoder_layers_18_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[282] + model_encoder_layers_18_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[283] + gv2292: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1821: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2292, R.dtype("float16")) + cls.layer_norm1(alloc1820, model_encoder_layers_18_self_attn_layer_norm_weight, model_encoder_layers_18_self_attn_layer_norm_bias, alloc1821) + R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_18_self_attn_layer_norm_bias) + model_encoder_layers_18_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[278] + model_encoder_layers_18_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[279] + gv2293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1822: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2293, R.dtype("float16")) + _1820: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_q_proj_weight, alloc1821, model_encoder_layers_18_self_attn_q_proj_bias, alloc1822) + R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_18_self_attn_q_proj_bias) + gv2294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape144: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1822, gv2294, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1822) + 
model_encoder_layers_18_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[275] + gv2295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1823: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2295, R.dtype("float16")) + _1821: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_18_self_attn_k_proj_weight, alloc1821, alloc1823) + R.vm.kill_object(model_encoder_layers_18_self_attn_k_proj_weight) + gv2296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape145: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1823, gv2296, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1823) + model_encoder_layers_18_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[276] + model_encoder_layers_18_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[277] + gv2297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1824: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2297, R.dtype("float16")) + _1822: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_v_proj_weight, alloc1821, model_encoder_layers_18_self_attn_v_proj_bias, alloc1824) + R.vm.kill_object(alloc1821) + 
R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_18_self_attn_v_proj_bias) + gv2298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape146: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1824, gv2298, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1824) + gv2299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape147: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape144, gv2299, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape144) + gv2300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape148: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape145, gv2300, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape145) + gv2301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape149: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape146, gv2301, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(reshape146) + gv2302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1825: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2302, R.dtype("float16")) + _1823: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape147, reshape148, reshape149, alloc1825) + R.vm.kill_object(reshape147) + R.vm.kill_object(reshape148) + R.vm.kill_object(reshape149) + gv2303: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape150: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1825, gv2303, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1825) + gv2304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape151: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape150, gv2304, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape150) + model_encoder_layers_18_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[280] + model_encoder_layers_18_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[281] + gv2305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1826: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2305, R.dtype("float16")) + _1824: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_18_self_attn_out_proj_weight, reshape151, model_encoder_layers_18_self_attn_out_proj_bias, alloc1826) + R.vm.kill_object(reshape151) + R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_18_self_attn_out_proj_bias) + gv2306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1827: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2306, R.dtype("float16")) + cls.add4(alloc1820, alloc1826, alloc1827) + R.vm.kill_object(alloc1820) + R.vm.kill_object(alloc1826) + model_encoder_layers_18_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[288] + model_encoder_layers_18_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[289] + gv2307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1828: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2307, R.dtype("float16")) + cls.layer_norm1(alloc1827, model_encoder_layers_18_final_layer_norm_weight, model_encoder_layers_18_final_layer_norm_bias, alloc1828) + R.vm.kill_object(model_encoder_layers_18_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_18_final_layer_norm_bias) + model_encoder_layers_18_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[284] + 
model_encoder_layers_18_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[285] + gv2308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1829: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2308, R.dtype("float16")) + _1827: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_18_fc1_weight, alloc1828, model_encoder_layers_18_fc1_bias, alloc1829) + R.vm.kill_object(alloc1828) + R.vm.kill_object(model_encoder_layers_18_fc1_weight) + R.vm.kill_object(model_encoder_layers_18_fc1_bias) + model_encoder_layers_18_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[286] + model_encoder_layers_18_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[287] + gv2309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1830: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2309, R.dtype("float16")) + _1828: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_18_fc2_weight, alloc1829, model_encoder_layers_18_fc2_bias, alloc1830) + R.vm.kill_object(alloc1829) + R.vm.kill_object(model_encoder_layers_18_fc2_weight) + R.vm.kill_object(model_encoder_layers_18_fc2_bias) + gv2310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1831: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2310, 
R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1827, alloc1830, alloc1831) + R.vm.kill_object(alloc1827) + R.vm.kill_object(alloc1830) + model_encoder_layers_19_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[297] + model_encoder_layers_19_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[298] + gv2311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1832: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2311, R.dtype("float16")) + cls.layer_norm1(alloc1831, model_encoder_layers_19_self_attn_layer_norm_weight, model_encoder_layers_19_self_attn_layer_norm_bias, alloc1832) + R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_19_self_attn_layer_norm_bias) + model_encoder_layers_19_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[293] + model_encoder_layers_19_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[294] + gv2312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1833: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2312, R.dtype("float16")) + _1831: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_q_proj_weight, alloc1832, model_encoder_layers_19_self_attn_q_proj_bias, alloc1833) + R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_19_self_attn_q_proj_bias) + gv2313: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape152: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1833, gv2313, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1833) + model_encoder_layers_19_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[290] + gv2314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1834: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2314, R.dtype("float16")) + _1832: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_19_self_attn_k_proj_weight, alloc1832, alloc1834) + R.vm.kill_object(model_encoder_layers_19_self_attn_k_proj_weight) + gv2315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape153: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1834, gv2315, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1834) + model_encoder_layers_19_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[291] + model_encoder_layers_19_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[292] + gv2316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1835: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2316, R.dtype("float16")) + _1833: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_v_proj_weight, alloc1832, model_encoder_layers_19_self_attn_v_proj_bias, alloc1835) + R.vm.kill_object(alloc1832) + R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_19_self_attn_v_proj_bias) + gv2317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape154: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1835, gv2317, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1835) + gv2318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape155: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape152, gv2318, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape152) + gv2319: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape156: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape153, gv2319, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape153) + gv2320: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape157: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape154, gv2320, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape154) + gv2321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1836: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2321, R.dtype("float16")) + _1834: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape155, reshape156, reshape157, alloc1836) + R.vm.kill_object(reshape155) + R.vm.kill_object(reshape156) + R.vm.kill_object(reshape157) + gv2322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape158: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1836, gv2322, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1836) + gv2323: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape159: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape158, gv2323, sinfo_args=(R.Tensor((batch_size, 1500, 1280), 
dtype="float16"),)) + R.vm.kill_object(reshape158) + model_encoder_layers_19_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[295] + model_encoder_layers_19_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[296] + gv2324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1837: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2324, R.dtype("float16")) + _1835: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_19_self_attn_out_proj_weight, reshape159, model_encoder_layers_19_self_attn_out_proj_bias, alloc1837) + R.vm.kill_object(reshape159) + R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_19_self_attn_out_proj_bias) + gv2325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1838: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2325, R.dtype("float16")) + cls.add4(alloc1831, alloc1837, alloc1838) + R.vm.kill_object(alloc1831) + R.vm.kill_object(alloc1837) + model_encoder_layers_19_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[303] + model_encoder_layers_19_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[304] + gv2326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1839: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2326, R.dtype("float16")) + cls.layer_norm1(alloc1838, model_encoder_layers_19_final_layer_norm_weight, model_encoder_layers_19_final_layer_norm_bias, alloc1839) + R.vm.kill_object(model_encoder_layers_19_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_19_final_layer_norm_bias) + model_encoder_layers_19_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[299] + model_encoder_layers_19_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[300] + gv2327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1840: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2327, R.dtype("float16")) + _1838: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_19_fc1_weight, alloc1839, model_encoder_layers_19_fc1_bias, alloc1840) + R.vm.kill_object(alloc1839) + R.vm.kill_object(model_encoder_layers_19_fc1_weight) + R.vm.kill_object(model_encoder_layers_19_fc1_bias) + model_encoder_layers_19_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[301] + model_encoder_layers_19_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[302] + gv2328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1841: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2328, R.dtype("float16")) + _1839: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_19_fc2_weight, alloc1840, model_encoder_layers_19_fc2_bias, alloc1841) + R.vm.kill_object(alloc1840) + 
R.vm.kill_object(model_encoder_layers_19_fc2_weight) + R.vm.kill_object(model_encoder_layers_19_fc2_bias) + gv2329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1842: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2329, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1838, alloc1841, alloc1842) + R.vm.kill_object(alloc1838) + R.vm.kill_object(alloc1841) + model_encoder_layers_20_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[312] + model_encoder_layers_20_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[313] + gv2330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1843: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2330, R.dtype("float16")) + cls.layer_norm1(alloc1842, model_encoder_layers_20_self_attn_layer_norm_weight, model_encoder_layers_20_self_attn_layer_norm_bias, alloc1843) + R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_20_self_attn_layer_norm_bias) + model_encoder_layers_20_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[308] + model_encoder_layers_20_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[309] + gv2331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1844: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, 
R.prim_value(0), gv2331, R.dtype("float16")) + _1842: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_q_proj_weight, alloc1843, model_encoder_layers_20_self_attn_q_proj_bias, alloc1844) + R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_20_self_attn_q_proj_bias) + gv2332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape160: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1844, gv2332, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1844) + model_encoder_layers_20_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[305] + gv2333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1845: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2333, R.dtype("float16")) + _1843: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_20_self_attn_k_proj_weight, alloc1843, alloc1845) + R.vm.kill_object(model_encoder_layers_20_self_attn_k_proj_weight) + gv2334: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape161: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1845, gv2334, sinfo_args=(R.Tensor((batch_size, 
1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1845) + model_encoder_layers_20_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[306] + model_encoder_layers_20_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[307] + gv2335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1846: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2335, R.dtype("float16")) + _1844: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_v_proj_weight, alloc1843, model_encoder_layers_20_self_attn_v_proj_bias, alloc1846) + R.vm.kill_object(alloc1843) + R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_20_self_attn_v_proj_bias) + gv2336: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape162: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1846, gv2336, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1846) + gv2337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape163: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape160, gv2337, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape160) + gv2338: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape164: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape161, gv2338, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape161) + gv2339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape165: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape162, gv2339, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape162) + gv2340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1847: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2340, R.dtype("float16")) + _1845: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape163, reshape164, reshape165, alloc1847) + R.vm.kill_object(reshape163) + R.vm.kill_object(reshape164) + R.vm.kill_object(reshape165) + gv2341: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape166: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1847, gv2341, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc1847) + gv2342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape167: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape166, gv2342, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape166) + model_encoder_layers_20_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[310] + model_encoder_layers_20_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[311] + gv2343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1848: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2343, R.dtype("float16")) + _1846: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_20_self_attn_out_proj_weight, reshape167, model_encoder_layers_20_self_attn_out_proj_bias, alloc1848) + R.vm.kill_object(reshape167) + R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_20_self_attn_out_proj_bias) + gv2344: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1849: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2344, R.dtype("float16")) + cls.add4(alloc1842, alloc1848, alloc1849) + R.vm.kill_object(alloc1842) + R.vm.kill_object(alloc1848) + model_encoder_layers_20_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = 
packed_params[318] + model_encoder_layers_20_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[319] + gv2345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1850: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2345, R.dtype("float16")) + cls.layer_norm1(alloc1849, model_encoder_layers_20_final_layer_norm_weight, model_encoder_layers_20_final_layer_norm_bias, alloc1850) + R.vm.kill_object(model_encoder_layers_20_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_20_final_layer_norm_bias) + model_encoder_layers_20_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[314] + model_encoder_layers_20_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[315] + gv2346: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1851: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2346, R.dtype("float16")) + _1849: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_20_fc1_weight, alloc1850, model_encoder_layers_20_fc1_bias, alloc1851) + R.vm.kill_object(alloc1850) + R.vm.kill_object(model_encoder_layers_20_fc1_weight) + R.vm.kill_object(model_encoder_layers_20_fc1_bias) + model_encoder_layers_20_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[316] + model_encoder_layers_20_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[317] + gv2347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1852: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2347, R.dtype("float16")) + _1850: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_20_fc2_weight, alloc1851, model_encoder_layers_20_fc2_bias, alloc1852) + R.vm.kill_object(alloc1851) + R.vm.kill_object(model_encoder_layers_20_fc2_weight) + R.vm.kill_object(model_encoder_layers_20_fc2_bias) + gv2348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1853: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2348, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1849, alloc1852, alloc1853) + R.vm.kill_object(alloc1849) + R.vm.kill_object(alloc1852) + model_encoder_layers_21_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[327] + model_encoder_layers_21_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[328] + gv2349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1854: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2349, R.dtype("float16")) + cls.layer_norm1(alloc1853, model_encoder_layers_21_self_attn_layer_norm_weight, model_encoder_layers_21_self_attn_layer_norm_bias, alloc1854) + R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_21_self_attn_layer_norm_bias) + model_encoder_layers_21_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[323] + 
model_encoder_layers_21_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[324] + gv2350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1855: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2350, R.dtype("float16")) + _1853: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_q_proj_weight, alloc1854, model_encoder_layers_21_self_attn_q_proj_bias, alloc1855) + R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_21_self_attn_q_proj_bias) + gv2351: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape168: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1855, gv2351, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1855) + model_encoder_layers_21_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[320] + gv2352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1856: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2352, R.dtype("float16")) + _1854: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_21_self_attn_k_proj_weight, alloc1854, alloc1856) + R.vm.kill_object(model_encoder_layers_21_self_attn_k_proj_weight) + gv2353: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape169: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1856, gv2353, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1856) + model_encoder_layers_21_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[321] + model_encoder_layers_21_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[322] + gv2354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1857: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2354, R.dtype("float16")) + _1855: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_v_proj_weight, alloc1854, model_encoder_layers_21_self_attn_v_proj_bias, alloc1857) + R.vm.kill_object(alloc1854) + R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_21_self_attn_v_proj_bias) + gv2355: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape170: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1857, gv2355, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1857) + gv2356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape171: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape168, gv2356, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape168) + gv2357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape172: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape169, gv2357, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape169) + gv2358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape173: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape170, gv2358, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape170) + gv2359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1858: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2359, R.dtype("float16")) + _1856: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape171, reshape172, reshape173, alloc1858) + R.vm.kill_object(reshape171) + R.vm.kill_object(reshape172) + R.vm.kill_object(reshape173) + gv2360: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape174: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1858, gv2360, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1858) + gv2361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape175: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape174, gv2361, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape174) + model_encoder_layers_21_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[325] + model_encoder_layers_21_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[326] + gv2362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1859: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2362, R.dtype("float16")) + _1857: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_21_self_attn_out_proj_weight, reshape175, model_encoder_layers_21_self_attn_out_proj_bias, alloc1859) + R.vm.kill_object(reshape175) + R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_21_self_attn_out_proj_bias) + gv2363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1860: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2363, R.dtype("float16")) + cls.add4(alloc1853, alloc1859, alloc1860) + R.vm.kill_object(alloc1853) + R.vm.kill_object(alloc1859) + model_encoder_layers_21_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[333] + model_encoder_layers_21_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[334] + gv2364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1861: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2364, R.dtype("float16")) + cls.layer_norm1(alloc1860, model_encoder_layers_21_final_layer_norm_weight, model_encoder_layers_21_final_layer_norm_bias, alloc1861) + R.vm.kill_object(model_encoder_layers_21_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_21_final_layer_norm_bias) + model_encoder_layers_21_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[329] + model_encoder_layers_21_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[330] + gv2365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1862: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2365, R.dtype("float16")) + _1860: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_21_fc1_weight, alloc1861, model_encoder_layers_21_fc1_bias, alloc1862) + R.vm.kill_object(alloc1861) + 
R.vm.kill_object(model_encoder_layers_21_fc1_weight) + R.vm.kill_object(model_encoder_layers_21_fc1_bias) + model_encoder_layers_21_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[331] + model_encoder_layers_21_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[332] + gv2366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1863: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2366, R.dtype("float16")) + _1861: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_21_fc2_weight, alloc1862, model_encoder_layers_21_fc2_bias, alloc1863) + R.vm.kill_object(alloc1862) + R.vm.kill_object(model_encoder_layers_21_fc2_weight) + R.vm.kill_object(model_encoder_layers_21_fc2_bias) + gv2367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1864: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2367, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1860, alloc1863, alloc1864) + R.vm.kill_object(alloc1860) + R.vm.kill_object(alloc1863) + model_encoder_layers_22_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[342] + model_encoder_layers_22_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[343] + gv2368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1865: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2368, R.dtype("float16")) + cls.layer_norm1(alloc1864, model_encoder_layers_22_self_attn_layer_norm_weight, model_encoder_layers_22_self_attn_layer_norm_bias, alloc1865) + R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_22_self_attn_layer_norm_bias) + model_encoder_layers_22_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[338] + model_encoder_layers_22_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[339] + gv2369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1866: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2369, R.dtype("float16")) + _1864: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_q_proj_weight, alloc1865, model_encoder_layers_22_self_attn_q_proj_bias, alloc1866) + R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_22_self_attn_q_proj_bias) + gv2370: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape176: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1866, gv2370, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1866) + model_encoder_layers_22_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[335] + gv2371: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1867: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2371, R.dtype("float16")) + _1865: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_22_self_attn_k_proj_weight, alloc1865, alloc1867) + R.vm.kill_object(model_encoder_layers_22_self_attn_k_proj_weight) + gv2372: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape177: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1867, gv2372, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1867) + model_encoder_layers_22_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[336] + model_encoder_layers_22_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[337] + gv2373: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1868: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2373, R.dtype("float16")) + _1866: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_v_proj_weight, alloc1865, model_encoder_layers_22_self_attn_v_proj_bias, alloc1868) + R.vm.kill_object(alloc1865) + R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_22_self_attn_v_proj_bias) + gv2374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape178: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1868, gv2374, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1868) + gv2375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape179: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape176, gv2375, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape176) + gv2376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape180: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape177, gv2376, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape177) + gv2377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape181: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape178, gv2377, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape178) + gv2378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + alloc1869: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2378, R.dtype("float16")) + _1867: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape179, reshape180, reshape181, alloc1869) + R.vm.kill_object(reshape179) + R.vm.kill_object(reshape180) + R.vm.kill_object(reshape181) + gv2379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape182: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1869, gv2379, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1869) + gv2380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape183: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape182, gv2380, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape182) + model_encoder_layers_22_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[340] + model_encoder_layers_22_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[341] + gv2381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1870: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2381, R.dtype("float16")) + _1868: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_22_self_attn_out_proj_weight, reshape183, model_encoder_layers_22_self_attn_out_proj_bias, alloc1870) + R.vm.kill_object(reshape183) + R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_22_self_attn_out_proj_bias) + gv2382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1871: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2382, R.dtype("float16")) + cls.add4(alloc1864, alloc1870, alloc1871) + R.vm.kill_object(alloc1864) + R.vm.kill_object(alloc1870) + model_encoder_layers_22_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[348] + model_encoder_layers_22_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[349] + gv2383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1872: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2383, R.dtype("float16")) + cls.layer_norm1(alloc1871, model_encoder_layers_22_final_layer_norm_weight, model_encoder_layers_22_final_layer_norm_bias, alloc1872) + R.vm.kill_object(model_encoder_layers_22_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_22_final_layer_norm_bias) + model_encoder_layers_22_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[344] + model_encoder_layers_22_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[345] + gv2384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1873: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2384, R.dtype("float16")) + _1871: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_22_fc1_weight, alloc1872, model_encoder_layers_22_fc1_bias, alloc1873) + R.vm.kill_object(alloc1872) + R.vm.kill_object(model_encoder_layers_22_fc1_weight) + R.vm.kill_object(model_encoder_layers_22_fc1_bias) + model_encoder_layers_22_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[346] + model_encoder_layers_22_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[347] + gv2385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1874: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2385, R.dtype("float16")) + _1872: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_22_fc2_weight, alloc1873, model_encoder_layers_22_fc2_bias, alloc1874) + R.vm.kill_object(alloc1873) + R.vm.kill_object(model_encoder_layers_22_fc2_weight) + R.vm.kill_object(model_encoder_layers_22_fc2_bias) + gv2386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1875: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2386, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1871, alloc1874, alloc1875) + R.vm.kill_object(alloc1871) + R.vm.kill_object(alloc1874) + model_encoder_layers_23_self_attn_layer_norm_weight: R.Tensor((1280,), 
dtype="float16") = packed_params[357] + model_encoder_layers_23_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[358] + gv2387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1876: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2387, R.dtype("float16")) + cls.layer_norm1(alloc1875, model_encoder_layers_23_self_attn_layer_norm_weight, model_encoder_layers_23_self_attn_layer_norm_bias, alloc1876) + R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_23_self_attn_layer_norm_bias) + model_encoder_layers_23_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[353] + model_encoder_layers_23_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[354] + gv2388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1877: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2388, R.dtype("float16")) + _1875: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_q_proj_weight, alloc1876, model_encoder_layers_23_self_attn_q_proj_bias, alloc1877) + R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_23_self_attn_q_proj_bias) + gv2389: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape184: 
R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1877, gv2389, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1877) + model_encoder_layers_23_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[350] + gv2390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1878: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2390, R.dtype("float16")) + _1876: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_23_self_attn_k_proj_weight, alloc1876, alloc1878) + R.vm.kill_object(model_encoder_layers_23_self_attn_k_proj_weight) + gv2391: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape185: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1878, gv2391, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1878) + model_encoder_layers_23_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[351] + model_encoder_layers_23_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[352] + gv2392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1879: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2392, R.dtype("float16")) + _1877: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_v_proj_weight, alloc1876, model_encoder_layers_23_self_attn_v_proj_bias, alloc1879) + R.vm.kill_object(alloc1876) + R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_23_self_attn_v_proj_bias) + gv2393: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape186: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1879, gv2393, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1879) + gv2394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape187: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape184, gv2394, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape184) + gv2395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape188: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape185, gv2395, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape185) + gv2396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + 
reshape189: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape186, gv2396, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape186) + gv2397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1880: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2397, R.dtype("float16")) + _1878: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape187, reshape188, reshape189, alloc1880) + R.vm.kill_object(reshape187) + R.vm.kill_object(reshape188) + R.vm.kill_object(reshape189) + gv2398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape190: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1880, gv2398, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1880) + gv2399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape191: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape190, gv2399, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape190) + model_encoder_layers_23_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[355] + model_encoder_layers_23_self_attn_out_proj_bias: 
R.Tensor((1280,), dtype="float16") = packed_params[356] + gv2400: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1881: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2400, R.dtype("float16")) + _1879: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_23_self_attn_out_proj_weight, reshape191, model_encoder_layers_23_self_attn_out_proj_bias, alloc1881) + R.vm.kill_object(reshape191) + R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_23_self_attn_out_proj_bias) + gv2401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1882: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2401, R.dtype("float16")) + cls.add4(alloc1875, alloc1881, alloc1882) + R.vm.kill_object(alloc1875) + R.vm.kill_object(alloc1881) + model_encoder_layers_23_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[363] + model_encoder_layers_23_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[364] + gv2402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1883: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2402, R.dtype("float16")) + cls.layer_norm1(alloc1882, model_encoder_layers_23_final_layer_norm_weight, model_encoder_layers_23_final_layer_norm_bias, alloc1883) + 
R.vm.kill_object(model_encoder_layers_23_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_23_final_layer_norm_bias) + model_encoder_layers_23_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[359] + model_encoder_layers_23_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[360] + gv2403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1884: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2403, R.dtype("float16")) + _1882: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_23_fc1_weight, alloc1883, model_encoder_layers_23_fc1_bias, alloc1884) + R.vm.kill_object(alloc1883) + R.vm.kill_object(model_encoder_layers_23_fc1_weight) + R.vm.kill_object(model_encoder_layers_23_fc1_bias) + model_encoder_layers_23_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[361] + model_encoder_layers_23_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[362] + gv2404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1885: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2404, R.dtype("float16")) + _1883: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_23_fc2_weight, alloc1884, model_encoder_layers_23_fc2_bias, alloc1885) + R.vm.kill_object(alloc1884) + R.vm.kill_object(model_encoder_layers_23_fc2_weight) + R.vm.kill_object(model_encoder_layers_23_fc2_bias) + gv2405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1886: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2405, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1882, alloc1885, alloc1886) + R.vm.kill_object(alloc1882) + R.vm.kill_object(alloc1885) + model_encoder_layers_24_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[372] + model_encoder_layers_24_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[373] + gv2406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1887: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2406, R.dtype("float16")) + cls.layer_norm1(alloc1886, model_encoder_layers_24_self_attn_layer_norm_weight, model_encoder_layers_24_self_attn_layer_norm_bias, alloc1887) + R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_24_self_attn_layer_norm_bias) + model_encoder_layers_24_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[368] + model_encoder_layers_24_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[369] + gv2407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1888: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2407, R.dtype("float16")) + _1886: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_q_proj_weight, alloc1887, 
model_encoder_layers_24_self_attn_q_proj_bias, alloc1888) + R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_24_self_attn_q_proj_bias) + gv2408: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape192: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1888, gv2408, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1888) + model_encoder_layers_24_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[365] + gv2409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1889: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2409, R.dtype("float16")) + _1887: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_24_self_attn_k_proj_weight, alloc1887, alloc1889) + R.vm.kill_object(model_encoder_layers_24_self_attn_k_proj_weight) + gv2410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape193: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1889, gv2410, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1889) + model_encoder_layers_24_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[366] + 
model_encoder_layers_24_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[367] + gv2411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1890: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2411, R.dtype("float16")) + _1888: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_v_proj_weight, alloc1887, model_encoder_layers_24_self_attn_v_proj_bias, alloc1890) + R.vm.kill_object(alloc1887) + R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_24_self_attn_v_proj_bias) + gv2412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape194: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1890, gv2412, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1890) + gv2413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape195: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape192, gv2413, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape192) + gv2414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape196: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape193, gv2414, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape193) + gv2415: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape197: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape194, gv2415, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape194) + gv2416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1891: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2416, R.dtype("float16")) + _1889: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape195, reshape196, reshape197, alloc1891) + R.vm.kill_object(reshape195) + R.vm.kill_object(reshape196) + R.vm.kill_object(reshape197) + gv2417: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape198: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1891, gv2417, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1891) + gv2418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape199: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape198, gv2418, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape198) + model_encoder_layers_24_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[370] + model_encoder_layers_24_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[371] + gv2419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1892: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2419, R.dtype("float16")) + _1890: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_24_self_attn_out_proj_weight, reshape199, model_encoder_layers_24_self_attn_out_proj_bias, alloc1892) + R.vm.kill_object(reshape199) + R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_24_self_attn_out_proj_bias) + gv2420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1893: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2420, R.dtype("float16")) + cls.add4(alloc1886, alloc1892, alloc1893) + R.vm.kill_object(alloc1886) + R.vm.kill_object(alloc1892) + model_encoder_layers_24_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[378] + model_encoder_layers_24_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[379] + gv2421: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1894: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2421, R.dtype("float16")) + cls.layer_norm1(alloc1893, model_encoder_layers_24_final_layer_norm_weight, model_encoder_layers_24_final_layer_norm_bias, alloc1894) + R.vm.kill_object(model_encoder_layers_24_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_24_final_layer_norm_bias) + model_encoder_layers_24_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[374] + model_encoder_layers_24_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[375] + gv2422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1895: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2422, R.dtype("float16")) + _1893: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_24_fc1_weight, alloc1894, model_encoder_layers_24_fc1_bias, alloc1895) + R.vm.kill_object(alloc1894) + R.vm.kill_object(model_encoder_layers_24_fc1_weight) + R.vm.kill_object(model_encoder_layers_24_fc1_bias) + model_encoder_layers_24_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[376] + model_encoder_layers_24_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[377] + gv2423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1896: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, 
R.prim_value(0), gv2423, R.dtype("float16")) + _1894: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_24_fc2_weight, alloc1895, model_encoder_layers_24_fc2_bias, alloc1896) + R.vm.kill_object(alloc1895) + R.vm.kill_object(model_encoder_layers_24_fc2_weight) + R.vm.kill_object(model_encoder_layers_24_fc2_bias) + gv2424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1897: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2424, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1893, alloc1896, alloc1897) + R.vm.kill_object(alloc1893) + R.vm.kill_object(alloc1896) + model_encoder_layers_25_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[387] + model_encoder_layers_25_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[388] + gv2425: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1898: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2425, R.dtype("float16")) + cls.layer_norm1(alloc1897, model_encoder_layers_25_self_attn_layer_norm_weight, model_encoder_layers_25_self_attn_layer_norm_bias, alloc1898) + R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_25_self_attn_layer_norm_bias) + model_encoder_layers_25_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[383] + model_encoder_layers_25_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[384] + gv2426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1899: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2426, R.dtype("float16")) + _1897: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_q_proj_weight, alloc1898, model_encoder_layers_25_self_attn_q_proj_bias, alloc1899) + R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_25_self_attn_q_proj_bias) + gv2427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape200: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1899, gv2427, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1899) + model_encoder_layers_25_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[380] + gv2428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1900: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2428, R.dtype("float16")) + _1898: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_25_self_attn_k_proj_weight, alloc1898, alloc1900) + R.vm.kill_object(model_encoder_layers_25_self_attn_k_proj_weight) + gv2429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape201: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1900, gv2429, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1900) + model_encoder_layers_25_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[381] + model_encoder_layers_25_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[382] + gv2430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1901: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2430, R.dtype("float16")) + _1899: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_v_proj_weight, alloc1898, model_encoder_layers_25_self_attn_v_proj_bias, alloc1901) + R.vm.kill_object(alloc1898) + R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_25_self_attn_v_proj_bias) + gv2431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape202: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1901, gv2431, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1901) + gv2432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + 
reshape203: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape200, gv2432, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape200) + gv2433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape204: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape201, gv2433, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape201) + gv2434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape205: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape202, gv2434, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape202) + gv2435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1902: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2435, R.dtype("float16")) + _1900: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape203, reshape204, reshape205, alloc1902) + R.vm.kill_object(reshape203) + R.vm.kill_object(reshape204) + R.vm.kill_object(reshape205) + gv2436: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), 
R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape206: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1902, gv2436, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1902) + gv2437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape207: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape206, gv2437, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape206) + model_encoder_layers_25_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[385] + model_encoder_layers_25_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[386] + gv2438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1903: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2438, R.dtype("float16")) + _1901: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_25_self_attn_out_proj_weight, reshape207, model_encoder_layers_25_self_attn_out_proj_bias, alloc1903) + R.vm.kill_object(reshape207) + R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_25_self_attn_out_proj_bias) + gv2439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1904: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage25, R.prim_value(0), gv2439, R.dtype("float16")) + cls.add4(alloc1897, alloc1903, alloc1904) + R.vm.kill_object(alloc1897) + R.vm.kill_object(alloc1903) + model_encoder_layers_25_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[393] + model_encoder_layers_25_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[394] + gv2440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1905: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2440, R.dtype("float16")) + cls.layer_norm1(alloc1904, model_encoder_layers_25_final_layer_norm_weight, model_encoder_layers_25_final_layer_norm_bias, alloc1905) + R.vm.kill_object(model_encoder_layers_25_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_25_final_layer_norm_bias) + model_encoder_layers_25_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[389] + model_encoder_layers_25_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[390] + gv2441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1906: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2441, R.dtype("float16")) + _1904: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_25_fc1_weight, alloc1905, model_encoder_layers_25_fc1_bias, alloc1906) + R.vm.kill_object(alloc1905) + R.vm.kill_object(model_encoder_layers_25_fc1_weight) + R.vm.kill_object(model_encoder_layers_25_fc1_bias) + model_encoder_layers_25_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = 
packed_params[391] + model_encoder_layers_25_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[392] + gv2442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1907: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2442, R.dtype("float16")) + _1905: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_25_fc2_weight, alloc1906, model_encoder_layers_25_fc2_bias, alloc1907) + R.vm.kill_object(alloc1906) + R.vm.kill_object(model_encoder_layers_25_fc2_weight) + R.vm.kill_object(model_encoder_layers_25_fc2_bias) + gv2443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1908: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2443, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1904, alloc1907, alloc1908) + R.vm.kill_object(alloc1904) + R.vm.kill_object(alloc1907) + model_encoder_layers_26_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[402] + model_encoder_layers_26_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[403] + gv2444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1909: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2444, R.dtype("float16")) + cls.layer_norm1(alloc1908, model_encoder_layers_26_self_attn_layer_norm_weight, model_encoder_layers_26_self_attn_layer_norm_bias, alloc1909) + 
R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_26_self_attn_layer_norm_bias) + model_encoder_layers_26_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[398] + model_encoder_layers_26_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[399] + gv2445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1910: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2445, R.dtype("float16")) + _1908: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_q_proj_weight, alloc1909, model_encoder_layers_26_self_attn_q_proj_bias, alloc1910) + R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_26_self_attn_q_proj_bias) + gv2446: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape208: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1910, gv2446, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1910) + model_encoder_layers_26_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[395] + gv2447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1911: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), 
gv2447, R.dtype("float16")) + _1909: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_26_self_attn_k_proj_weight, alloc1909, alloc1911) + R.vm.kill_object(model_encoder_layers_26_self_attn_k_proj_weight) + gv2448: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape209: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1911, gv2448, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1911) + model_encoder_layers_26_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[396] + model_encoder_layers_26_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[397] + gv2449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1912: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2449, R.dtype("float16")) + _1910: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_v_proj_weight, alloc1909, model_encoder_layers_26_self_attn_v_proj_bias, alloc1912) + R.vm.kill_object(alloc1909) + R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_26_self_attn_v_proj_bias) + gv2450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape210: R.Tensor((batch_size, 1500, 
20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1912, gv2450, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1912) + gv2451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape211: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape208, gv2451, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape208) + gv2452: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape212: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape209, gv2452, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape209) + gv2453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape213: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape210, gv2453, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape210) + gv2454: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1913: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2454, R.dtype("float16")) + _1911: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", 
paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape211, reshape212, reshape213, alloc1913) + R.vm.kill_object(reshape211) + R.vm.kill_object(reshape212) + R.vm.kill_object(reshape213) + gv2455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape214: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1913, gv2455, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1913) + gv2456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape215: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape214, gv2456, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape214) + model_encoder_layers_26_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[400] + model_encoder_layers_26_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[401] + gv2457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1914: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2457, R.dtype("float16")) + _1912: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_26_self_attn_out_proj_weight, reshape215, model_encoder_layers_26_self_attn_out_proj_bias, alloc1914) + R.vm.kill_object(reshape215) + 
R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_26_self_attn_out_proj_bias) + gv2458: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1915: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2458, R.dtype("float16")) + cls.add4(alloc1908, alloc1914, alloc1915) + R.vm.kill_object(alloc1908) + R.vm.kill_object(alloc1914) + model_encoder_layers_26_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[408] + model_encoder_layers_26_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[409] + gv2459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1916: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2459, R.dtype("float16")) + cls.layer_norm1(alloc1915, model_encoder_layers_26_final_layer_norm_weight, model_encoder_layers_26_final_layer_norm_bias, alloc1916) + R.vm.kill_object(model_encoder_layers_26_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_26_final_layer_norm_bias) + model_encoder_layers_26_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[404] + model_encoder_layers_26_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[405] + gv2460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1917: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2460, R.dtype("float16")) + _1915: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_26_fc1_weight, alloc1916, model_encoder_layers_26_fc1_bias, alloc1917) + R.vm.kill_object(alloc1916) + R.vm.kill_object(model_encoder_layers_26_fc1_weight) + R.vm.kill_object(model_encoder_layers_26_fc1_bias) + model_encoder_layers_26_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[406] + model_encoder_layers_26_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[407] + gv2461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1918: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2461, R.dtype("float16")) + _1916: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_26_fc2_weight, alloc1917, model_encoder_layers_26_fc2_bias, alloc1918) + R.vm.kill_object(alloc1917) + R.vm.kill_object(model_encoder_layers_26_fc2_weight) + R.vm.kill_object(model_encoder_layers_26_fc2_bias) + gv2462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1919: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2462, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1915, alloc1918, alloc1919) + R.vm.kill_object(alloc1915) + R.vm.kill_object(alloc1918) + model_encoder_layers_27_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[417] + model_encoder_layers_27_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[418] + gv2463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1920: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2463, R.dtype("float16")) + cls.layer_norm1(alloc1919, model_encoder_layers_27_self_attn_layer_norm_weight, model_encoder_layers_27_self_attn_layer_norm_bias, alloc1920) + R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_27_self_attn_layer_norm_bias) + model_encoder_layers_27_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[413] + model_encoder_layers_27_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[414] + gv2464: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1921: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2464, R.dtype("float16")) + _1919: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_q_proj_weight, alloc1920, model_encoder_layers_27_self_attn_q_proj_bias, alloc1921) + R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_27_self_attn_q_proj_bias) + gv2465: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape216: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1921, gv2465, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1921) + 
model_encoder_layers_27_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[410] + gv2466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1922: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2466, R.dtype("float16")) + _1920: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_27_self_attn_k_proj_weight, alloc1920, alloc1922) + R.vm.kill_object(model_encoder_layers_27_self_attn_k_proj_weight) + gv2467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape217: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1922, gv2467, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1922) + model_encoder_layers_27_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[411] + model_encoder_layers_27_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[412] + gv2468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1923: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2468, R.dtype("float16")) + _1921: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_v_proj_weight, alloc1920, model_encoder_layers_27_self_attn_v_proj_bias, alloc1923) + R.vm.kill_object(alloc1920) + 
R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_27_self_attn_v_proj_bias) + gv2469: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape218: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1923, gv2469, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1923) + gv2470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape219: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape216, gv2470, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape216) + gv2471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape220: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape217, gv2471, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape217) + gv2472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape221: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape218, gv2472, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(reshape218) + gv2473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1924: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2473, R.dtype("float16")) + _1922: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape219, reshape220, reshape221, alloc1924) + R.vm.kill_object(reshape219) + R.vm.kill_object(reshape220) + R.vm.kill_object(reshape221) + gv2474: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape222: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1924, gv2474, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1924) + gv2475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape223: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape222, gv2475, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape222) + model_encoder_layers_27_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[415] + model_encoder_layers_27_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[416] + gv2476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1925: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2476, R.dtype("float16")) + _1923: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_27_self_attn_out_proj_weight, reshape223, model_encoder_layers_27_self_attn_out_proj_bias, alloc1925) + R.vm.kill_object(reshape223) + R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_27_self_attn_out_proj_bias) + gv2477: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1926: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2477, R.dtype("float16")) + cls.add4(alloc1919, alloc1925, alloc1926) + R.vm.kill_object(alloc1919) + R.vm.kill_object(alloc1925) + model_encoder_layers_27_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[423] + model_encoder_layers_27_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[424] + gv2478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1927: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2478, R.dtype("float16")) + cls.layer_norm1(alloc1926, model_encoder_layers_27_final_layer_norm_weight, model_encoder_layers_27_final_layer_norm_bias, alloc1927) + R.vm.kill_object(model_encoder_layers_27_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_27_final_layer_norm_bias) + model_encoder_layers_27_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[419] + 
model_encoder_layers_27_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[420] + gv2479: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1928: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2479, R.dtype("float16")) + _1926: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_27_fc1_weight, alloc1927, model_encoder_layers_27_fc1_bias, alloc1928) + R.vm.kill_object(alloc1927) + R.vm.kill_object(model_encoder_layers_27_fc1_weight) + R.vm.kill_object(model_encoder_layers_27_fc1_bias) + model_encoder_layers_27_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[421] + model_encoder_layers_27_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[422] + gv2480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1929: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2480, R.dtype("float16")) + _1927: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_27_fc2_weight, alloc1928, model_encoder_layers_27_fc2_bias, alloc1929) + R.vm.kill_object(alloc1928) + R.vm.kill_object(model_encoder_layers_27_fc2_weight) + R.vm.kill_object(model_encoder_layers_27_fc2_bias) + gv2481: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1930: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2481, 
R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1926, alloc1929, alloc1930) + R.vm.kill_object(alloc1926) + R.vm.kill_object(alloc1929) + model_encoder_layers_28_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[432] + model_encoder_layers_28_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[433] + gv2482: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1931: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2482, R.dtype("float16")) + cls.layer_norm1(alloc1930, model_encoder_layers_28_self_attn_layer_norm_weight, model_encoder_layers_28_self_attn_layer_norm_bias, alloc1931) + R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_28_self_attn_layer_norm_bias) + model_encoder_layers_28_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[428] + model_encoder_layers_28_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[429] + gv2483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1932: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2483, R.dtype("float16")) + _1930: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_q_proj_weight, alloc1931, model_encoder_layers_28_self_attn_q_proj_bias, alloc1932) + R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_28_self_attn_q_proj_bias) + gv2484: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape224: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1932, gv2484, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1932) + model_encoder_layers_28_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[425] + gv2485: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1933: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2485, R.dtype("float16")) + _1931: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_28_self_attn_k_proj_weight, alloc1931, alloc1933) + R.vm.kill_object(model_encoder_layers_28_self_attn_k_proj_weight) + gv2486: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape225: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1933, gv2486, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1933) + model_encoder_layers_28_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[426] + model_encoder_layers_28_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[427] + gv2487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1934: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2487, R.dtype("float16")) + _1932: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_v_proj_weight, alloc1931, model_encoder_layers_28_self_attn_v_proj_bias, alloc1934) + R.vm.kill_object(alloc1931) + R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_28_self_attn_v_proj_bias) + gv2488: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape226: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1934, gv2488, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1934) + gv2489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape227: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape224, gv2489, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape224) + gv2490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape228: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape225, gv2490, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape225) + gv2491: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape229: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape226, gv2491, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape226) + gv2492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1935: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2492, R.dtype("float16")) + _1933: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape227, reshape228, reshape229, alloc1935) + R.vm.kill_object(reshape227) + R.vm.kill_object(reshape228) + R.vm.kill_object(reshape229) + gv2493: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape230: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1935, gv2493, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1935) + gv2494: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape231: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape230, gv2494, sinfo_args=(R.Tensor((batch_size, 1500, 1280), 
dtype="float16"),)) + R.vm.kill_object(reshape230) + model_encoder_layers_28_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[430] + model_encoder_layers_28_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[431] + gv2495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1936: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2495, R.dtype("float16")) + _1934: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_28_self_attn_out_proj_weight, reshape231, model_encoder_layers_28_self_attn_out_proj_bias, alloc1936) + R.vm.kill_object(reshape231) + R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_28_self_attn_out_proj_bias) + gv2496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1937: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2496, R.dtype("float16")) + cls.add4(alloc1930, alloc1936, alloc1937) + R.vm.kill_object(alloc1930) + R.vm.kill_object(alloc1936) + model_encoder_layers_28_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[438] + model_encoder_layers_28_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[439] + gv2497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1938: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage28, R.prim_value(0), gv2497, R.dtype("float16")) + cls.layer_norm1(alloc1937, model_encoder_layers_28_final_layer_norm_weight, model_encoder_layers_28_final_layer_norm_bias, alloc1938) + R.vm.kill_object(model_encoder_layers_28_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_28_final_layer_norm_bias) + model_encoder_layers_28_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[434] + model_encoder_layers_28_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[435] + gv2498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1939: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2498, R.dtype("float16")) + _1937: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_28_fc1_weight, alloc1938, model_encoder_layers_28_fc1_bias, alloc1939) + R.vm.kill_object(alloc1938) + R.vm.kill_object(model_encoder_layers_28_fc1_weight) + R.vm.kill_object(model_encoder_layers_28_fc1_bias) + model_encoder_layers_28_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[436] + model_encoder_layers_28_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[437] + gv2499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1940: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2499, R.dtype("float16")) + _1938: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_28_fc2_weight, alloc1939, model_encoder_layers_28_fc2_bias, alloc1940) + R.vm.kill_object(alloc1939) + 
R.vm.kill_object(model_encoder_layers_28_fc2_weight) + R.vm.kill_object(model_encoder_layers_28_fc2_bias) + gv2500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1941: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2500, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1937, alloc1940, alloc1941) + R.vm.kill_object(alloc1937) + R.vm.kill_object(alloc1940) + model_encoder_layers_29_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[447] + model_encoder_layers_29_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[448] + gv2501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1942: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2501, R.dtype("float16")) + cls.layer_norm1(alloc1941, model_encoder_layers_29_self_attn_layer_norm_weight, model_encoder_layers_29_self_attn_layer_norm_bias, alloc1942) + R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_29_self_attn_layer_norm_bias) + model_encoder_layers_29_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[443] + model_encoder_layers_29_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[444] + gv2502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1943: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, 
R.prim_value(0), gv2502, R.dtype("float16")) + _1941: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_q_proj_weight, alloc1942, model_encoder_layers_29_self_attn_q_proj_bias, alloc1943) + R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_29_self_attn_q_proj_bias) + gv2503: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape232: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1943, gv2503, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1943) + model_encoder_layers_29_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[440] + gv2504: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1944: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2504, R.dtype("float16")) + _1942: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_29_self_attn_k_proj_weight, alloc1942, alloc1944) + R.vm.kill_object(model_encoder_layers_29_self_attn_k_proj_weight) + gv2505: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape233: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1944, gv2505, sinfo_args=(R.Tensor((batch_size, 
1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1944) + model_encoder_layers_29_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[441] + model_encoder_layers_29_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[442] + gv2506: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1945: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2506, R.dtype("float16")) + _1943: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_v_proj_weight, alloc1942, model_encoder_layers_29_self_attn_v_proj_bias, alloc1945) + R.vm.kill_object(alloc1942) + R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_29_self_attn_v_proj_bias) + gv2507: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape234: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1945, gv2507, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1945) + gv2508: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape235: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape232, gv2508, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape232) + gv2509: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape236: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape233, gv2509, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape233) + gv2510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape237: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape234, gv2510, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape234) + gv2511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1946: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2511, R.dtype("float16")) + _1944: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape235, reshape236, reshape237, alloc1946) + R.vm.kill_object(reshape235) + R.vm.kill_object(reshape236) + R.vm.kill_object(reshape237) + gv2512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape238: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1946, gv2512, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc1946) + gv2513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape239: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape238, gv2513, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape238) + model_encoder_layers_29_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[445] + model_encoder_layers_29_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[446] + gv2514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1947: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2514, R.dtype("float16")) + _1945: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_29_self_attn_out_proj_weight, reshape239, model_encoder_layers_29_self_attn_out_proj_bias, alloc1947) + R.vm.kill_object(reshape239) + R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_29_self_attn_out_proj_bias) + gv2515: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1948: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2515, R.dtype("float16")) + cls.add4(alloc1941, alloc1947, alloc1948) + R.vm.kill_object(alloc1941) + R.vm.kill_object(alloc1947) + model_encoder_layers_29_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = 
packed_params[453] + model_encoder_layers_29_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[454] + gv2516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1949: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2516, R.dtype("float16")) + cls.layer_norm1(alloc1948, model_encoder_layers_29_final_layer_norm_weight, model_encoder_layers_29_final_layer_norm_bias, alloc1949) + R.vm.kill_object(model_encoder_layers_29_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_29_final_layer_norm_bias) + model_encoder_layers_29_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[449] + model_encoder_layers_29_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[450] + gv2517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1950: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2517, R.dtype("float16")) + _1948: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_29_fc1_weight, alloc1949, model_encoder_layers_29_fc1_bias, alloc1950) + R.vm.kill_object(alloc1949) + R.vm.kill_object(model_encoder_layers_29_fc1_weight) + R.vm.kill_object(model_encoder_layers_29_fc1_bias) + model_encoder_layers_29_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[451] + model_encoder_layers_29_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[452] + gv2518: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1951: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2518, R.dtype("float16")) + _1949: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_29_fc2_weight, alloc1950, model_encoder_layers_29_fc2_bias, alloc1951) + R.vm.kill_object(alloc1950) + R.vm.kill_object(model_encoder_layers_29_fc2_weight) + R.vm.kill_object(model_encoder_layers_29_fc2_bias) + gv2519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1952: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2519, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1948, alloc1951, alloc1952) + R.vm.kill_object(alloc1948) + R.vm.kill_object(alloc1951) + model_encoder_layers_30_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[462] + model_encoder_layers_30_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[463] + gv2520: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1953: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2520, R.dtype("float16")) + cls.layer_norm1(alloc1952, model_encoder_layers_30_self_attn_layer_norm_weight, model_encoder_layers_30_self_attn_layer_norm_bias, alloc1953) + R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_30_self_attn_layer_norm_bias) + model_encoder_layers_30_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[458] + 
model_encoder_layers_30_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[459] + gv2521: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1954: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2521, R.dtype("float16")) + _1952: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_q_proj_weight, alloc1953, model_encoder_layers_30_self_attn_q_proj_bias, alloc1954) + R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_30_self_attn_q_proj_bias) + gv2522: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape240: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1954, gv2522, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1954) + model_encoder_layers_30_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[455] + gv2523: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1955: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2523, R.dtype("float16")) + _1953: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_30_self_attn_k_proj_weight, alloc1953, alloc1955) + R.vm.kill_object(model_encoder_layers_30_self_attn_k_proj_weight) + gv2524: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape241: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1955, gv2524, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1955) + model_encoder_layers_30_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[456] + model_encoder_layers_30_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[457] + gv2525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1956: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2525, R.dtype("float16")) + _1954: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_v_proj_weight, alloc1953, model_encoder_layers_30_self_attn_v_proj_bias, alloc1956) + R.vm.kill_object(alloc1953) + R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_30_self_attn_v_proj_bias) + gv2526: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape242: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1956, gv2526, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1956) + gv2527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape243: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape240, gv2527, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape240) + gv2528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape244: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape241, gv2528, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape241) + gv2529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape245: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape242, gv2529, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape242) + gv2530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1957: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2530, R.dtype("float16")) + _1955: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape243, reshape244, reshape245, alloc1957) + R.vm.kill_object(reshape243) + R.vm.kill_object(reshape244) + R.vm.kill_object(reshape245) + gv2531: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape246: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1957, gv2531, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1957) + gv2532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape247: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape246, gv2532, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape246) + model_encoder_layers_30_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[460] + model_encoder_layers_30_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[461] + gv2533: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1958: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2533, R.dtype("float16")) + _1956: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_30_self_attn_out_proj_weight, reshape247, model_encoder_layers_30_self_attn_out_proj_bias, alloc1958) + R.vm.kill_object(reshape247) + R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_30_self_attn_out_proj_bias) + gv2534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1959: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2534, R.dtype("float16")) + cls.add4(alloc1952, alloc1958, alloc1959) + R.vm.kill_object(alloc1952) + R.vm.kill_object(alloc1958) + model_encoder_layers_30_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[468] + model_encoder_layers_30_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[469] + gv2535: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1960: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2535, R.dtype("float16")) + cls.layer_norm1(alloc1959, model_encoder_layers_30_final_layer_norm_weight, model_encoder_layers_30_final_layer_norm_bias, alloc1960) + R.vm.kill_object(model_encoder_layers_30_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_30_final_layer_norm_bias) + model_encoder_layers_30_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[464] + model_encoder_layers_30_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[465] + gv2536: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1961: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2536, R.dtype("float16")) + _1959: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_30_fc1_weight, alloc1960, model_encoder_layers_30_fc1_bias, alloc1961) + R.vm.kill_object(alloc1960) + 
R.vm.kill_object(model_encoder_layers_30_fc1_weight) + R.vm.kill_object(model_encoder_layers_30_fc1_bias) + model_encoder_layers_30_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[466] + model_encoder_layers_30_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[467] + gv2537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1962: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2537, R.dtype("float16")) + _1960: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_30_fc2_weight, alloc1961, model_encoder_layers_30_fc2_bias, alloc1962) + R.vm.kill_object(alloc1961) + R.vm.kill_object(model_encoder_layers_30_fc2_weight) + R.vm.kill_object(model_encoder_layers_30_fc2_bias) + gv2538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1963: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2538, R.dtype("float16")) + cls.fused_add4_maximum_minimum(alloc1959, alloc1962, alloc1963) + R.vm.kill_object(alloc1959) + R.vm.kill_object(alloc1962) + model_encoder_layers_31_self_attn_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[477] + model_encoder_layers_31_self_attn_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[478] + gv2539: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1964: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, 
R.prim_value(0), gv2539, R.dtype("float16")) + cls.layer_norm1(alloc1963, model_encoder_layers_31_self_attn_layer_norm_weight, model_encoder_layers_31_self_attn_layer_norm_bias, alloc1964) + R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_31_self_attn_layer_norm_bias) + model_encoder_layers_31_self_attn_q_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[473] + model_encoder_layers_31_self_attn_q_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[474] + gv2540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1965: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2540, R.dtype("float16")) + _1963: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_q_proj_weight, alloc1964, model_encoder_layers_31_self_attn_q_proj_bias, alloc1965) + R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_weight) + R.vm.kill_object(model_encoder_layers_31_self_attn_q_proj_bias) + gv2541: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape248: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1965, gv2541, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1965) + model_encoder_layers_31_self_attn_k_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[470] + gv2542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1966: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2542, R.dtype("float16")) + _1964: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_cublas", model_encoder_layers_31_self_attn_k_proj_weight, alloc1964, alloc1966) + R.vm.kill_object(model_encoder_layers_31_self_attn_k_proj_weight) + gv2543: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape249: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1966, gv2543, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1966) + model_encoder_layers_31_self_attn_v_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[471] + model_encoder_layers_31_self_attn_v_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[472] + gv2544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1967: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2544, R.dtype("float16")) + _1965: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_v_proj_weight, alloc1964, model_encoder_layers_31_self_attn_v_proj_bias, alloc1967) + R.vm.kill_object(alloc1964) + R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_weight) + R.vm.kill_object(model_encoder_layers_31_self_attn_v_proj_bias) + gv2545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape250: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1967, gv2545, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1967) + gv2546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape251: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape248, gv2546, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape248) + gv2547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape252: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape249, gv2547, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape249) + gv2548: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape253: R.Tensor((batch_size * 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape250, gv2548, sinfo_args=(R.Tensor((batch_size * 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape250) + gv2549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + alloc1968: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2549, R.dtype("float16")) + _1966: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_no_append", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape251, reshape252, reshape253, alloc1968) + R.vm.kill_object(reshape251) + R.vm.kill_object(reshape252) + R.vm.kill_object(reshape253) + gv2550: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape254: R.Tensor((batch_size, 1500, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1968, gv2550, sinfo_args=(R.Tensor((batch_size, 1500, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1968) + gv2551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape255: R.Tensor((batch_size, 1500, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape254, gv2551, sinfo_args=(R.Tensor((batch_size, 1500, 1280), dtype="float16"),)) + R.vm.kill_object(reshape254) + model_encoder_layers_31_self_attn_out_proj_weight: R.Tensor((1280, 1280), dtype="float16") = packed_params[475] + model_encoder_layers_31_self_attn_out_proj_bias: R.Tensor((1280,), dtype="float16") = packed_params[476] + gv2552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1969: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2552, R.dtype("float16")) + _1967: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_cublas", model_encoder_layers_31_self_attn_out_proj_weight, reshape255, model_encoder_layers_31_self_attn_out_proj_bias, alloc1969) + R.vm.kill_object(reshape255) + R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_weight) + R.vm.kill_object(model_encoder_layers_31_self_attn_out_proj_bias) + gv2553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1970: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage25, R.prim_value(0), gv2553, R.dtype("float16")) + R.vm.kill_object(storage25) + cls.add4(alloc1963, alloc1969, alloc1970) + R.vm.kill_object(alloc1963) + R.vm.kill_object(alloc1969) + model_encoder_layers_31_final_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[483] + model_encoder_layers_31_final_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[484] + gv2554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1971: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage28, R.prim_value(0), gv2554, R.dtype("float16")) + R.vm.kill_object(storage28) + cls.layer_norm1(alloc1970, model_encoder_layers_31_final_layer_norm_weight, model_encoder_layers_31_final_layer_norm_bias, alloc1971) + R.vm.kill_object(model_encoder_layers_31_final_layer_norm_weight) + R.vm.kill_object(model_encoder_layers_31_final_layer_norm_bias) + model_encoder_layers_31_fc1_weight: R.Tensor((5120, 1280), dtype="float16") = packed_params[479] + model_encoder_layers_31_fc1_bias: R.Tensor((5120,), dtype="float16") = packed_params[480] + gv2555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1972: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage24, R.prim_value(0), gv2555, R.dtype("float16")) + R.vm.kill_object(storage24) + _1970: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu2_cublas", model_encoder_layers_31_fc1_weight, alloc1971, model_encoder_layers_31_fc1_bias, alloc1972) + R.vm.kill_object(alloc1971) + R.vm.kill_object(model_encoder_layers_31_fc1_weight) + R.vm.kill_object(model_encoder_layers_31_fc1_bias) + model_encoder_layers_31_fc2_weight: R.Tensor((1280, 5120), dtype="float16") = packed_params[481] + model_encoder_layers_31_fc2_bias: R.Tensor((1280,), dtype="float16") = packed_params[482] + gv2556: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1973: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage26, R.prim_value(0), gv2556, R.dtype("float16")) + R.vm.kill_object(storage26) + _1971: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add5_cublas", model_encoder_layers_31_fc2_weight, alloc1972, model_encoder_layers_31_fc2_bias, alloc1973) + R.vm.kill_object(alloc1972) + R.vm.kill_object(model_encoder_layers_31_fc2_weight) + R.vm.kill_object(model_encoder_layers_31_fc2_bias) + gv2557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1974: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage27, R.prim_value(0), gv2557, R.dtype("float16")) + R.vm.kill_object(storage27) + cls.fused_add4_maximum_minimum(alloc1970, alloc1973, 
alloc1974) + R.vm.kill_object(alloc1970) + R.vm.kill_object(alloc1973) + model_encoder_layer_norm_weight: R.Tensor((1280,), dtype="float16") = packed_params[485] + model_encoder_layer_norm_bias: R.Tensor((1280,), dtype="float16") = packed_params[486] + storage29: R.Object = R.vm.alloc_storage(R.shape([30720000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2558: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1975: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage29, R.prim_value(0), gv2558, R.dtype("float16")) + R.vm.kill_object(storage29) + cls.layer_norm1(alloc1974, model_encoder_layer_norm_weight, model_encoder_layer_norm_bias, alloc1975) + R.vm.kill_object(alloc1974) + R.vm.kill_object(model_encoder_layer_norm_weight) + R.vm.kill_object(model_encoder_layer_norm_bias) + R.call_packed("vm.builtin.match_shape", alloc1975, shape_heap, R.prim_value(3), R.prim_value(3), R.prim_value(0), R.prim_value(0), R.prim_value(1500), R.prim_value(0), R.prim_value(1280), R.str("ErrorContext(fn=batch_encode, loc=return, annotation=R.Tensor((batch_size, 1500, 1280), dtype=\"float16\")) "), sinfo_args=(R.Tuple,)) + return alloc1975 + + @R.function + def batch_prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), logit_positions: R.Tensor(("batch_size",), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, "batch_size", 51866), dtype="float32"): + batch_size = T.int64() + seq_len = T.int64() + R.func_attr({"num_input": 3, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), 
sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", logit_positions, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=batch_prefill, loc=param[3], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 
1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), 
R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=batch_prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", logit_positions, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=batch_prefill, loc=param[1], param=logit_positions, annotation=R.Tensor((batch_size,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + model_decoder_embed_tokens_weight2: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + gv10: 
R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) + reshape384: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv10, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) + model_decoder_embed_tokens_weight2_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + storage4: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv11: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) + alloc4: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv11, R.dtype("float16")) + cls.take(model_decoder_embed_tokens_weight2_1, reshape384, alloc4) + R.vm.kill_object(reshape384) + R.vm.kill_object(model_decoder_embed_tokens_weight2_1) + gv12: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape385: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc4, gv12, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(alloc4) + lv68: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) + model_decoder_embed_positions_weight2: R.Tensor((448, 1280), dtype="float16") = packed_params[488] + storage5: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv13: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=2),)) + alloc5: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv13, R.dtype("float16")) + cls.take1(model_decoder_embed_positions_weight2, lv68, alloc5) + R.vm.kill_object(lv68) + R.vm.kill_object(model_decoder_embed_positions_weight2) + gv14: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape386: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc5, gv14, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(alloc5) + storage6: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv15: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc6: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv15, R.dtype("float16")) + cls.add5(reshape385, reshape386, alloc6) + R.vm.kill_object(reshape385) + R.vm.kill_object(reshape386) + model_decoder_layers_0_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[496] + model_decoder_layers_0_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[497] + gv16: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc7: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv16, R.dtype("float16")) + cls.layer_norm2(alloc6, model_decoder_layers_0_self_attn_layer_norm_weight2, model_decoder_layers_0_self_attn_layer_norm_bias2, alloc7) 
+ R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias2) + model_decoder_layers_0_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] + model_decoder_layers_0_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[493] + gv17: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc8: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv17, R.dtype("float16")) + _6: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight2, alloc7, model_decoder_layers_0_self_attn_q_proj_bias2, alloc8) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias2) + gv18: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape387: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc8, gv18, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc8) + model_decoder_layers_0_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] + storage7: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv19: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc9: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv19, R.dtype("float16")) + _7: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight2, alloc7, alloc9) + R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight2) + gv20: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape388: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc9, gv20, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc9) + model_decoder_layers_0_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] + model_decoder_layers_0_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[491] + storage8: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv21: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc10: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv21, R.dtype("float16")) + _8: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight2, alloc7, model_decoder_layers_0_self_attn_v_proj_bias2, alloc10) + R.vm.kill_object(alloc7) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias2) + gv22: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape389: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc10, gv22, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc10) + gv23: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc11: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv23, R.dtype("float16")) + cls.concatenate1(reshape387, reshape388, reshape389, alloc11) + R.vm.kill_object(reshape387) + R.vm.kill_object(reshape388) + R.vm.kill_object(reshape389) + gv24: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape390: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc11, gv24, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc11) + gv25: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc12: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv25, R.dtype("float16")) + _10: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape390, alloc12) + R.vm.kill_object(reshape390) + gv26: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape391: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc12, gv26, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc12) + gv27: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape392: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape391, gv27, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape391) + model_decoder_layers_0_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] + model_decoder_layers_0_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[495] + gv28: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc13: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv28, R.dtype("float16")) + _11: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight2, reshape392, model_decoder_layers_0_self_attn_out_proj_bias2, alloc13) + R.vm.kill_object(reshape392) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias2) + gv29: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc14: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, 
R.prim_value(0), gv29, R.dtype("float16")) + cls.add5(alloc6, alloc13, alloc14) + R.vm.kill_object(alloc6) + R.vm.kill_object(alloc13) + model_decoder_layers_0_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[505] + model_decoder_layers_0_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[506] + gv30: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc15: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv30, R.dtype("float16")) + cls.layer_norm2(alloc14, model_decoder_layers_0_encoder_attn_layer_norm_weight2, model_decoder_layers_0_encoder_attn_layer_norm_bias2, alloc15) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias2) + model_decoder_layers_0_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] + model_decoder_layers_0_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[502] + gv31: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc16: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv31, R.dtype("float16")) + _14: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight2, alloc15, model_decoder_layers_0_encoder_attn_q_proj_bias2, alloc16) + R.vm.kill_object(alloc15) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias2) + gv32: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape393: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc16, gv32, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc16) + gv33: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape394: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape393, gv33, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape393) + gv34: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc17: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv34, R.dtype("float16")) + _15: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape394, alloc17) + R.vm.kill_object(reshape394) + gv35: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape395: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc17, gv35, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc17) + gv36: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape396: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape395, gv36, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape395) + model_decoder_layers_0_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] + model_decoder_layers_0_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[504] + gv37: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc18: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv37, R.dtype("float16")) + _16: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight2, reshape396, model_decoder_layers_0_encoder_attn_out_proj_bias2, alloc18) + R.vm.kill_object(reshape396) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias2) + gv38: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc19: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv38, R.dtype("float16")) + cls.add5(alloc14, alloc18, alloc19) + R.vm.kill_object(alloc14) + R.vm.kill_object(alloc18) + model_decoder_layers_0_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[511] + model_decoder_layers_0_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[512] + gv39: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc20: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv39, R.dtype("float16")) + cls.layer_norm2(alloc19, model_decoder_layers_0_final_layer_norm_weight2, model_decoder_layers_0_final_layer_norm_bias2, alloc20) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias2) + model_decoder_layers_0_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] + model_decoder_layers_0_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[508] + gv40: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc21: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv40, R.dtype("float16")) + _19: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_0_fc1_weight2, alloc20, model_decoder_layers_0_fc1_bias2, alloc21) + R.vm.kill_object(alloc20) + R.vm.kill_object(model_decoder_layers_0_fc1_weight2) + R.vm.kill_object(model_decoder_layers_0_fc1_bias2) + model_decoder_layers_0_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] + model_decoder_layers_0_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[510] + gv41: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc22: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv41, R.dtype("float16")) + 
_20: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight2, alloc21, model_decoder_layers_0_fc2_bias2, alloc22) + R.vm.kill_object(alloc21) + R.vm.kill_object(model_decoder_layers_0_fc2_weight2) + R.vm.kill_object(model_decoder_layers_0_fc2_bias2) + gv42: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc23: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv42, R.dtype("float16")) + cls.add5(alloc19, alloc22, alloc23) + R.vm.kill_object(alloc19) + R.vm.kill_object(alloc22) + model_decoder_layers_1_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[520] + model_decoder_layers_1_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[521] + gv43: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc24: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv43, R.dtype("float16")) + cls.layer_norm2(alloc23, model_decoder_layers_1_self_attn_layer_norm_weight2, model_decoder_layers_1_self_attn_layer_norm_bias2, alloc24) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias2) + model_decoder_layers_1_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] + model_decoder_layers_1_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[517] + gv44: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc25: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv44, R.dtype("float16")) + _23: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight2, alloc24, model_decoder_layers_1_self_attn_q_proj_bias2, alloc25) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias2) + gv45: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape397: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc25, gv45, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc25) + model_decoder_layers_1_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] + gv46: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc26: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv46, R.dtype("float16")) + _24: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight2, alloc24, alloc26) + R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight2) + gv47: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape398: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc26, gv47, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc26) + model_decoder_layers_1_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] + model_decoder_layers_1_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[515] + gv48: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc27: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv48, R.dtype("float16")) + _25: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight2, alloc24, model_decoder_layers_1_self_attn_v_proj_bias2, alloc27) + R.vm.kill_object(alloc24) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias2) + gv49: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape399: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc27, gv49, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc27) + gv50: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc28: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv50, R.dtype("float16")) + cls.concatenate1(reshape397, reshape398, reshape399, alloc28) + 
R.vm.kill_object(reshape397) + R.vm.kill_object(reshape398) + R.vm.kill_object(reshape399) + gv51: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape400: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc28, gv51, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc28) + gv52: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc29: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv52, R.dtype("float16")) + _27: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape400, alloc29) + R.vm.kill_object(reshape400) + gv53: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape401: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc29, gv53, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc29) + gv54: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape402: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape401, gv54, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape401) + 
model_decoder_layers_1_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] + model_decoder_layers_1_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[519] + gv55: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc30: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv55, R.dtype("float16")) + _28: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight2, reshape402, model_decoder_layers_1_self_attn_out_proj_bias2, alloc30) + R.vm.kill_object(reshape402) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias2) + gv56: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc31: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv56, R.dtype("float16")) + cls.add5(alloc23, alloc30, alloc31) + R.vm.kill_object(alloc23) + R.vm.kill_object(alloc30) + model_decoder_layers_1_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[529] + model_decoder_layers_1_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[530] + gv57: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc32: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv57, R.dtype("float16")) + cls.layer_norm2(alloc31, 
model_decoder_layers_1_encoder_attn_layer_norm_weight2, model_decoder_layers_1_encoder_attn_layer_norm_bias2, alloc32) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias2) + model_decoder_layers_1_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] + model_decoder_layers_1_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[526] + gv58: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc33: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv58, R.dtype("float16")) + _31: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight2, alloc32, model_decoder_layers_1_encoder_attn_q_proj_bias2, alloc33) + R.vm.kill_object(alloc32) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias2) + gv59: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape403: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc33, gv59, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc33) + gv60: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape404: R.Tensor((seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape403, gv60, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape403) + gv61: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc34: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv61, R.dtype("float16")) + _32: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape404, alloc34) + R.vm.kill_object(reshape404) + gv62: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape405: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc34, gv62, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc34) + gv63: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape406: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape405, gv63, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape405) + model_decoder_layers_1_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] + model_decoder_layers_1_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[528] + gv64: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc35: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv64, R.dtype("float16")) + _33: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight2, reshape406, model_decoder_layers_1_encoder_attn_out_proj_bias2, alloc35) + R.vm.kill_object(reshape406) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias2) + gv65: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc36: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv65, R.dtype("float16")) + cls.add5(alloc31, alloc35, alloc36) + R.vm.kill_object(alloc31) + R.vm.kill_object(alloc35) + model_decoder_layers_1_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[535] + model_decoder_layers_1_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[536] + gv66: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc37: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv66, R.dtype("float16")) + cls.layer_norm2(alloc36, model_decoder_layers_1_final_layer_norm_weight2, model_decoder_layers_1_final_layer_norm_bias2, alloc37) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias2) + model_decoder_layers_1_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] + model_decoder_layers_1_fc1_bias2: R.Tensor((5120,), dtype="float16") 
= packed_params[532] + gv67: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc38: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv67, R.dtype("float16")) + _36: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight2, alloc37, model_decoder_layers_1_fc1_bias2, alloc38) + R.vm.kill_object(alloc37) + R.vm.kill_object(model_decoder_layers_1_fc1_weight2) + R.vm.kill_object(model_decoder_layers_1_fc1_bias2) + model_decoder_layers_1_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] + model_decoder_layers_1_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[534] + gv68: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc39: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv68, R.dtype("float16")) + _37: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight2, alloc38, model_decoder_layers_1_fc2_bias2, alloc39) + R.vm.kill_object(alloc38) + R.vm.kill_object(model_decoder_layers_1_fc2_weight2) + R.vm.kill_object(model_decoder_layers_1_fc2_bias2) + gv69: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc40: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv69, R.dtype("float16")) + cls.add5(alloc36, alloc39, alloc40) + R.vm.kill_object(alloc36) + R.vm.kill_object(alloc39) + 
model_decoder_layers_2_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[544] + model_decoder_layers_2_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[545] + gv70: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc41: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv70, R.dtype("float16")) + cls.layer_norm2(alloc40, model_decoder_layers_2_self_attn_layer_norm_weight2, model_decoder_layers_2_self_attn_layer_norm_bias2, alloc41) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias2) + model_decoder_layers_2_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] + model_decoder_layers_2_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[541] + gv71: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc42: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv71, R.dtype("float16")) + _40: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_q_proj_weight2, alloc41, model_decoder_layers_2_self_attn_q_proj_bias2, alloc42) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias2) + gv72: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape407: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc42, gv72, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc42) + model_decoder_layers_2_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] + gv73: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc43: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv73, R.dtype("float16")) + _41: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight2, alloc41, alloc43) + R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight2) + gv74: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape408: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc43, gv74, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc43) + model_decoder_layers_2_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[538] + model_decoder_layers_2_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[539] + gv75: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc44: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv75, R.dtype("float16")) + _42: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight2, alloc41, model_decoder_layers_2_self_attn_v_proj_bias2, alloc44) + R.vm.kill_object(alloc41) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias2) + gv76: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape409: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc44, gv76, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc44) + gv77: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc45: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv77, R.dtype("float16")) + cls.concatenate1(reshape407, reshape408, reshape409, alloc45) + R.vm.kill_object(reshape407) + R.vm.kill_object(reshape408) + R.vm.kill_object(reshape409) + gv78: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape410: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc45, gv78, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc45) + gv79: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + alloc46: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv79, R.dtype("float16")) + _44: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape410, alloc46) + R.vm.kill_object(reshape410) + gv80: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape411: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc46, gv80, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc46) + gv81: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape412: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape411, gv81, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape411) + model_decoder_layers_2_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] + model_decoder_layers_2_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[543] + gv82: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc47: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv82, R.dtype("float16")) + _45: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight2, reshape412, 
model_decoder_layers_2_self_attn_out_proj_bias2, alloc47) + R.vm.kill_object(reshape412) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias2) + gv83: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc48: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv83, R.dtype("float16")) + cls.add5(alloc40, alloc47, alloc48) + R.vm.kill_object(alloc40) + R.vm.kill_object(alloc47) + model_decoder_layers_2_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[553] + model_decoder_layers_2_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[554] + gv84: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc49: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv84, R.dtype("float16")) + cls.layer_norm2(alloc48, model_decoder_layers_2_encoder_attn_layer_norm_weight2, model_decoder_layers_2_encoder_attn_layer_norm_bias2, alloc49) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias2) + model_decoder_layers_2_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] + model_decoder_layers_2_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[550] + gv85: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc50: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv85, R.dtype("float16")) + _48: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight2, alloc49, model_decoder_layers_2_encoder_attn_q_proj_bias2, alloc50) + R.vm.kill_object(alloc49) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias2) + gv86: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape413: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc50, gv86, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc50) + gv87: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape414: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape413, gv87, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape413) + gv88: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc51: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv88, R.dtype("float16")) + _49: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape414, alloc51) + R.vm.kill_object(reshape414) + gv89: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape415: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc51, gv89, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc51) + gv90: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape416: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape415, gv90, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape415) + model_decoder_layers_2_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] + model_decoder_layers_2_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[552] + gv91: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc52: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv91, R.dtype("float16")) + _50: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight2, reshape416, model_decoder_layers_2_encoder_attn_out_proj_bias2, alloc52) + R.vm.kill_object(reshape416) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias2) + gv92: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc53: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv92, R.dtype("float16")) + cls.add5(alloc48, alloc52, alloc53) + R.vm.kill_object(alloc48) + R.vm.kill_object(alloc52) + model_decoder_layers_2_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[559] + model_decoder_layers_2_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[560] + gv93: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc54: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv93, R.dtype("float16")) + cls.layer_norm2(alloc53, model_decoder_layers_2_final_layer_norm_weight2, model_decoder_layers_2_final_layer_norm_bias2, alloc54) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias2) + model_decoder_layers_2_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] + model_decoder_layers_2_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[556] + gv94: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc55: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv94, R.dtype("float16")) + _53: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight2, alloc54, model_decoder_layers_2_fc1_bias2, alloc55) + R.vm.kill_object(alloc54) + R.vm.kill_object(model_decoder_layers_2_fc1_weight2) + R.vm.kill_object(model_decoder_layers_2_fc1_bias2) + 
model_decoder_layers_2_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] + model_decoder_layers_2_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[558] + gv95: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc56: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv95, R.dtype("float16")) + _54: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight2, alloc55, model_decoder_layers_2_fc2_bias2, alloc56) + R.vm.kill_object(alloc55) + R.vm.kill_object(model_decoder_layers_2_fc2_weight2) + R.vm.kill_object(model_decoder_layers_2_fc2_bias2) + gv96: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc57: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv96, R.dtype("float16")) + cls.add5(alloc53, alloc56, alloc57) + R.vm.kill_object(alloc53) + R.vm.kill_object(alloc56) + model_decoder_layers_3_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[568] + model_decoder_layers_3_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[569] + gv97: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc58: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv97, R.dtype("float16")) + cls.layer_norm2(alloc57, model_decoder_layers_3_self_attn_layer_norm_weight2, model_decoder_layers_3_self_attn_layer_norm_bias2, 
alloc58) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias2) + model_decoder_layers_3_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] + model_decoder_layers_3_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[565] + gv98: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc59: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv98, R.dtype("float16")) + _57: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight2, alloc58, model_decoder_layers_3_self_attn_q_proj_bias2, alloc59) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias2) + gv99: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape417: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc59, gv99, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc59) + model_decoder_layers_3_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] + gv100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc60: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv100, R.dtype("float16")) + _58: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight2, alloc58, alloc60) + R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight2) + gv101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape418: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc60, gv101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc60) + model_decoder_layers_3_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] + model_decoder_layers_3_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[563] + gv102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc61: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv102, R.dtype("float16")) + _59: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight2, alloc58, model_decoder_layers_3_self_attn_v_proj_bias2, alloc61) + R.vm.kill_object(alloc58) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias2) + gv103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape419: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc61, gv103, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc61) + gv104: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc62: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv104, R.dtype("float16")) + cls.concatenate1(reshape417, reshape418, reshape419, alloc62) + R.vm.kill_object(reshape417) + R.vm.kill_object(reshape418) + R.vm.kill_object(reshape419) + gv105: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape420: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc62, gv105, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc62) + gv106: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc63: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv106, R.dtype("float16")) + _61: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape420, alloc63) + R.vm.kill_object(reshape420) + gv107: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape421: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc63, gv107, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc63) + gv108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape422: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape421, gv108, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape421) + model_decoder_layers_3_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] + model_decoder_layers_3_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[567] + gv109: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc64: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv109, R.dtype("float16")) + _62: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_out_proj_weight2, reshape422, model_decoder_layers_3_self_attn_out_proj_bias2, alloc64) + R.vm.kill_object(reshape422) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias2) + gv110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc65: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv110, R.dtype("float16")) + cls.add5(alloc57, alloc64, alloc65) + R.vm.kill_object(alloc57) + R.vm.kill_object(alloc64) + model_decoder_layers_3_encoder_attn_layer_norm_weight2: 
R.Tensor((1280,), dtype="float16") = packed_params[577] + model_decoder_layers_3_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[578] + gv111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc66: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv111, R.dtype("float16")) + cls.layer_norm2(alloc65, model_decoder_layers_3_encoder_attn_layer_norm_weight2, model_decoder_layers_3_encoder_attn_layer_norm_bias2, alloc66) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias2) + model_decoder_layers_3_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] + model_decoder_layers_3_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[574] + gv112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc67: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv112, R.dtype("float16")) + _65: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight2, alloc66, model_decoder_layers_3_encoder_attn_q_proj_bias2, alloc67) + R.vm.kill_object(alloc66) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias2) + gv113: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape423: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc67, gv113, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc67) + gv114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape424: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape423, gv114, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape423) + gv115: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc68: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv115, R.dtype("float16")) + _66: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape424, alloc68) + R.vm.kill_object(reshape424) + gv116: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape425: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc68, gv116, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc68) + gv117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape426: R.Tensor((1, seq_len, 1280), dtype="float16") = 
R.call_packed("vm.builtin.reshape", reshape425, gv117, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape425) + model_decoder_layers_3_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] + model_decoder_layers_3_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[576] + gv118: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc69: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv118, R.dtype("float16")) + _67: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight2, reshape426, model_decoder_layers_3_encoder_attn_out_proj_bias2, alloc69) + R.vm.kill_object(reshape426) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias2) + gv119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc70: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv119, R.dtype("float16")) + cls.add5(alloc65, alloc69, alloc70) + R.vm.kill_object(alloc65) + R.vm.kill_object(alloc69) + model_decoder_layers_3_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[583] + model_decoder_layers_3_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[584] + gv120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc71: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv120, R.dtype("float16")) + cls.layer_norm2(alloc70, model_decoder_layers_3_final_layer_norm_weight2, model_decoder_layers_3_final_layer_norm_bias2, alloc71) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias2) + model_decoder_layers_3_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] + model_decoder_layers_3_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[580] + gv121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc72: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv121, R.dtype("float16")) + _70: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight2, alloc71, model_decoder_layers_3_fc1_bias2, alloc72) + R.vm.kill_object(alloc71) + R.vm.kill_object(model_decoder_layers_3_fc1_weight2) + R.vm.kill_object(model_decoder_layers_3_fc1_bias2) + model_decoder_layers_3_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] + model_decoder_layers_3_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[582] + gv122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc73: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv122, R.dtype("float16")) + _71: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight2, alloc72, model_decoder_layers_3_fc2_bias2, 
alloc73) + R.vm.kill_object(alloc72) + R.vm.kill_object(model_decoder_layers_3_fc2_weight2) + R.vm.kill_object(model_decoder_layers_3_fc2_bias2) + gv123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc74: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv123, R.dtype("float16")) + cls.add5(alloc70, alloc73, alloc74) + R.vm.kill_object(alloc70) + R.vm.kill_object(alloc73) + model_decoder_layers_4_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[592] + model_decoder_layers_4_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[593] + gv124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc75: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv124, R.dtype("float16")) + cls.layer_norm2(alloc74, model_decoder_layers_4_self_attn_layer_norm_weight2, model_decoder_layers_4_self_attn_layer_norm_bias2, alloc75) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias2) + model_decoder_layers_4_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] + model_decoder_layers_4_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[589] + gv125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc76: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv125, 
R.dtype("float16")) + _74: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight2, alloc75, model_decoder_layers_4_self_attn_q_proj_bias2, alloc76) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias2) + gv126: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape427: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc76, gv126, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc76) + model_decoder_layers_4_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] + gv127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc77: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv127, R.dtype("float16")) + _75: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight2, alloc75, alloc77) + R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight2) + gv128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape428: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc77, gv128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc77) + 
model_decoder_layers_4_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] + model_decoder_layers_4_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[587] + gv129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc78: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv129, R.dtype("float16")) + _76: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight2, alloc75, model_decoder_layers_4_self_attn_v_proj_bias2, alloc78) + R.vm.kill_object(alloc75) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias2) + gv130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape429: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc78, gv130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc78) + gv131: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc79: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv131, R.dtype("float16")) + cls.concatenate1(reshape427, reshape428, reshape429, alloc79) + R.vm.kill_object(reshape427) + R.vm.kill_object(reshape428) + R.vm.kill_object(reshape429) + gv132: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape430: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc79, gv132, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc79) + gv133: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc80: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv133, R.dtype("float16")) + _78: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape430, alloc80) + R.vm.kill_object(reshape430) + gv134: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape431: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc80, gv134, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc80) + gv135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape432: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape431, gv135, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape431) + model_decoder_layers_4_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] + 
model_decoder_layers_4_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[591] + gv136: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc81: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv136, R.dtype("float16")) + _79: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight2, reshape432, model_decoder_layers_4_self_attn_out_proj_bias2, alloc81) + R.vm.kill_object(reshape432) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias2) + gv137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc82: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv137, R.dtype("float16")) + cls.add5(alloc74, alloc81, alloc82) + R.vm.kill_object(alloc74) + R.vm.kill_object(alloc81) + model_decoder_layers_4_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[601] + model_decoder_layers_4_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[602] + gv138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc83: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv138, R.dtype("float16")) + cls.layer_norm2(alloc82, model_decoder_layers_4_encoder_attn_layer_norm_weight2, model_decoder_layers_4_encoder_attn_layer_norm_bias2, 
alloc83) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias2) + model_decoder_layers_4_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] + model_decoder_layers_4_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[598] + gv139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc84: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv139, R.dtype("float16")) + _82: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_q_proj_weight2, alloc83, model_decoder_layers_4_encoder_attn_q_proj_bias2, alloc84) + R.vm.kill_object(alloc83) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias2) + gv140: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape433: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc84, gv140, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc84) + gv141: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape434: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape433, gv141, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(reshape433) + gv142: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc85: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv142, R.dtype("float16")) + _83: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape434, alloc85) + R.vm.kill_object(reshape434) + gv143: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape435: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc85, gv143, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc85) + gv144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape436: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape435, gv144, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape435) + model_decoder_layers_4_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] + model_decoder_layers_4_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[600] + gv145: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc86: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv145, R.dtype("float16")) + _84: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight2, reshape436, model_decoder_layers_4_encoder_attn_out_proj_bias2, alloc86) + R.vm.kill_object(reshape436) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias2) + gv146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc87: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv146, R.dtype("float16")) + cls.add5(alloc82, alloc86, alloc87) + R.vm.kill_object(alloc82) + R.vm.kill_object(alloc86) + model_decoder_layers_4_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[607] + model_decoder_layers_4_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[608] + gv147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc88: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv147, R.dtype("float16")) + cls.layer_norm2(alloc87, model_decoder_layers_4_final_layer_norm_weight2, model_decoder_layers_4_final_layer_norm_bias2, alloc88) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias2) + model_decoder_layers_4_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] + model_decoder_layers_4_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[604] + gv148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc89: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv148, R.dtype("float16")) + _87: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight2, alloc88, model_decoder_layers_4_fc1_bias2, alloc89) + R.vm.kill_object(alloc88) + R.vm.kill_object(model_decoder_layers_4_fc1_weight2) + R.vm.kill_object(model_decoder_layers_4_fc1_bias2) + model_decoder_layers_4_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] + model_decoder_layers_4_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[606] + gv149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc90: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv149, R.dtype("float16")) + _88: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight2, alloc89, model_decoder_layers_4_fc2_bias2, alloc90) + R.vm.kill_object(alloc89) + R.vm.kill_object(model_decoder_layers_4_fc2_weight2) + R.vm.kill_object(model_decoder_layers_4_fc2_bias2) + gv150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc91: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv150, R.dtype("float16")) + cls.add5(alloc87, alloc90, alloc91) + R.vm.kill_object(alloc87) + R.vm.kill_object(alloc90) + model_decoder_layers_5_self_attn_layer_norm_weight2: R.Tensor((1280,), 
dtype="float16") = packed_params[616] + model_decoder_layers_5_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[617] + gv151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc92: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv151, R.dtype("float16")) + cls.layer_norm2(alloc91, model_decoder_layers_5_self_attn_layer_norm_weight2, model_decoder_layers_5_self_attn_layer_norm_bias2, alloc92) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias2) + model_decoder_layers_5_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] + model_decoder_layers_5_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[613] + gv152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc93: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv152, R.dtype("float16")) + _91: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight2, alloc92, model_decoder_layers_5_self_attn_q_proj_bias2, alloc93) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias2) + gv153: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape437: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc93, gv153, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc93) + model_decoder_layers_5_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] + gv154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc94: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv154, R.dtype("float16")) + _92: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight2, alloc92, alloc94) + R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight2) + gv155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape438: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc94, gv155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc94) + model_decoder_layers_5_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] + model_decoder_layers_5_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[611] + gv156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc95: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv156, R.dtype("float16")) + _93: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_5_self_attn_v_proj_weight2, alloc92, model_decoder_layers_5_self_attn_v_proj_bias2, alloc95) + R.vm.kill_object(alloc92) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias2) + gv157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape439: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc95, gv157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc95) + gv158: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc96: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv158, R.dtype("float16")) + cls.concatenate1(reshape437, reshape438, reshape439, alloc96) + R.vm.kill_object(reshape437) + R.vm.kill_object(reshape438) + R.vm.kill_object(reshape439) + gv159: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape440: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc96, gv159, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc96) + gv160: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc97: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage7, R.prim_value(0), gv160, R.dtype("float16")) + _95: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape440, alloc97) + R.vm.kill_object(reshape440) + gv161: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape441: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc97, gv161, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc97) + gv162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape442: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape441, gv162, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape441) + model_decoder_layers_5_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] + model_decoder_layers_5_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[615] + gv163: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc98: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv163, R.dtype("float16")) + _96: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight2, reshape442, model_decoder_layers_5_self_attn_out_proj_bias2, alloc98) + R.vm.kill_object(reshape442) 
+ R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias2) + gv164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc99: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv164, R.dtype("float16")) + cls.add5(alloc91, alloc98, alloc99) + R.vm.kill_object(alloc91) + R.vm.kill_object(alloc98) + model_decoder_layers_5_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[625] + model_decoder_layers_5_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[626] + gv165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv165, R.dtype("float16")) + cls.layer_norm2(alloc99, model_decoder_layers_5_encoder_attn_layer_norm_weight2, model_decoder_layers_5_encoder_attn_layer_norm_bias2, alloc100) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias2) + model_decoder_layers_5_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] + model_decoder_layers_5_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[622] + gv166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, 
R.prim_value(0), gv166, R.dtype("float16")) + _99: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight2, alloc100, model_decoder_layers_5_encoder_attn_q_proj_bias2, alloc101) + R.vm.kill_object(alloc100) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias2) + gv167: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape443: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc101, gv167, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc101) + gv168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape444: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape443, gv168, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape443) + gv169: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv169, R.dtype("float16")) + _100: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape444, alloc102) + R.vm.kill_object(reshape444) + gv170: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape445: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc102, gv170, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc102) + gv171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape446: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape445, gv171, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape445) + model_decoder_layers_5_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] + model_decoder_layers_5_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[624] + gv172: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv172, R.dtype("float16")) + _101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight2, reshape446, model_decoder_layers_5_encoder_attn_out_proj_bias2, alloc103) + R.vm.kill_object(reshape446) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias2) + gv173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv173, R.dtype("float16")) + cls.add5(alloc99, alloc103, alloc104) + R.vm.kill_object(alloc99) + R.vm.kill_object(alloc103) + model_decoder_layers_5_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[631] + model_decoder_layers_5_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[632] + gv174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv174, R.dtype("float16")) + cls.layer_norm2(alloc104, model_decoder_layers_5_final_layer_norm_weight2, model_decoder_layers_5_final_layer_norm_bias2, alloc105) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias2) + model_decoder_layers_5_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] + model_decoder_layers_5_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[628] + gv175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv175, R.dtype("float16")) + _104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight2, alloc105, model_decoder_layers_5_fc1_bias2, alloc106) + R.vm.kill_object(alloc105) + R.vm.kill_object(model_decoder_layers_5_fc1_weight2) + R.vm.kill_object(model_decoder_layers_5_fc1_bias2) + model_decoder_layers_5_fc2_weight2: 
R.Tensor((1280, 5120), dtype="float16") = packed_params[629] + model_decoder_layers_5_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[630] + gv176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv176, R.dtype("float16")) + _105: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight2, alloc106, model_decoder_layers_5_fc2_bias2, alloc107) + R.vm.kill_object(alloc106) + R.vm.kill_object(model_decoder_layers_5_fc2_weight2) + R.vm.kill_object(model_decoder_layers_5_fc2_bias2) + gv177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc108: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv177, R.dtype("float16")) + cls.add5(alloc104, alloc107, alloc108) + R.vm.kill_object(alloc104) + R.vm.kill_object(alloc107) + model_decoder_layers_6_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[640] + model_decoder_layers_6_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[641] + gv178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv178, R.dtype("float16")) + cls.layer_norm2(alloc108, model_decoder_layers_6_self_attn_layer_norm_weight2, model_decoder_layers_6_self_attn_layer_norm_bias2, alloc109) + 
R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias2) + model_decoder_layers_6_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] + model_decoder_layers_6_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[637] + gv179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv179, R.dtype("float16")) + _108: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight2, alloc109, model_decoder_layers_6_self_attn_q_proj_bias2, alloc110) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias2) + gv180: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape447: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc110, gv180, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc110) + model_decoder_layers_6_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] + gv181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv181, R.dtype("float16")) + _109: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight2, alloc109, alloc111) + R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight2) + gv182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape448: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc111, gv182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc111) + model_decoder_layers_6_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] + model_decoder_layers_6_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[635] + gv183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv183, R.dtype("float16")) + _110: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_v_proj_weight2, alloc109, model_decoder_layers_6_self_attn_v_proj_bias2, alloc112) + R.vm.kill_object(alloc109) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias2) + gv184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape449: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc112, 
gv184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc112) + gv185: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc113: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv185, R.dtype("float16")) + cls.concatenate1(reshape447, reshape448, reshape449, alloc113) + R.vm.kill_object(reshape447) + R.vm.kill_object(reshape448) + R.vm.kill_object(reshape449) + gv186: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape450: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc113, gv186, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc113) + gv187: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv187, R.dtype("float16")) + _112: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape450, alloc114) + R.vm.kill_object(reshape450) + gv188: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape451: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc114, gv188, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc114) + gv189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape452: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape451, gv189, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape451) + model_decoder_layers_6_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] + model_decoder_layers_6_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[639] + gv190: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc115: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv190, R.dtype("float16")) + _113: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight2, reshape452, model_decoder_layers_6_self_attn_out_proj_bias2, alloc115) + R.vm.kill_object(reshape452) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias2) + gv191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv191, R.dtype("float16")) + cls.add5(alloc108, alloc115, alloc116) + R.vm.kill_object(alloc108) + R.vm.kill_object(alloc115) + 
model_decoder_layers_6_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[649] + model_decoder_layers_6_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[650] + gv192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv192, R.dtype("float16")) + cls.layer_norm2(alloc116, model_decoder_layers_6_encoder_attn_layer_norm_weight2, model_decoder_layers_6_encoder_attn_layer_norm_bias2, alloc117) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias2) + model_decoder_layers_6_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] + model_decoder_layers_6_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[646] + gv193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv193, R.dtype("float16")) + _116: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight2, alloc117, model_decoder_layers_6_encoder_attn_q_proj_bias2, alloc118) + R.vm.kill_object(alloc117) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias2) + gv194: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape453: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc118, gv194, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc118) + gv195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape454: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape453, gv195, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape453) + gv196: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv196, R.dtype("float16")) + _117: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape454, alloc119) + R.vm.kill_object(reshape454) + gv197: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape455: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc119, gv197, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc119) + gv198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
reshape456: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape455, gv198, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape455) + model_decoder_layers_6_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] + model_decoder_layers_6_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[648] + gv199: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv199, R.dtype("float16")) + _118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight2, reshape456, model_decoder_layers_6_encoder_attn_out_proj_bias2, alloc120) + R.vm.kill_object(reshape456) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias2) + gv200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv200, R.dtype("float16")) + cls.add5(alloc116, alloc120, alloc121) + R.vm.kill_object(alloc116) + R.vm.kill_object(alloc120) + model_decoder_layers_6_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[655] + model_decoder_layers_6_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[656] + gv201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv201, R.dtype("float16")) + cls.layer_norm2(alloc121, model_decoder_layers_6_final_layer_norm_weight2, model_decoder_layers_6_final_layer_norm_bias2, alloc122) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias2) + model_decoder_layers_6_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] + model_decoder_layers_6_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[652] + gv202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv202, R.dtype("float16")) + _121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight2, alloc122, model_decoder_layers_6_fc1_bias2, alloc123) + R.vm.kill_object(alloc122) + R.vm.kill_object(model_decoder_layers_6_fc1_weight2) + R.vm.kill_object(model_decoder_layers_6_fc1_bias2) + model_decoder_layers_6_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] + model_decoder_layers_6_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[654] + gv203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv203, R.dtype("float16")) + _122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_6_fc2_weight2, alloc123, model_decoder_layers_6_fc2_bias2, alloc124) + R.vm.kill_object(alloc123) + R.vm.kill_object(model_decoder_layers_6_fc2_weight2) + R.vm.kill_object(model_decoder_layers_6_fc2_bias2) + gv204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc125: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv204, R.dtype("float16")) + cls.add5(alloc121, alloc124, alloc125) + R.vm.kill_object(alloc121) + R.vm.kill_object(alloc124) + model_decoder_layers_7_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[664] + model_decoder_layers_7_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[665] + gv205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv205, R.dtype("float16")) + cls.layer_norm2(alloc125, model_decoder_layers_7_self_attn_layer_norm_weight2, model_decoder_layers_7_self_attn_layer_norm_bias2, alloc126) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias2) + model_decoder_layers_7_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] + model_decoder_layers_7_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[661] + gv206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc127: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv206, R.dtype("float16")) + _125: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_q_proj_weight2, alloc126, model_decoder_layers_7_self_attn_q_proj_bias2, alloc127) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias2) + gv207: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape457: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc127, gv207, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc127) + model_decoder_layers_7_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] + gv208: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv208, R.dtype("float16")) + _126: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight2, alloc126, alloc128) + R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight2) + gv209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape458: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc128, gv209, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc128) + model_decoder_layers_7_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] + model_decoder_layers_7_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[659] + gv210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv210, R.dtype("float16")) + _127: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight2, alloc126, model_decoder_layers_7_self_attn_v_proj_bias2, alloc129) + R.vm.kill_object(alloc126) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias2) + gv211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape459: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc129, gv211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc129) + gv212: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc130: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv212, R.dtype("float16")) + cls.concatenate1(reshape457, reshape458, reshape459, alloc130) + R.vm.kill_object(reshape457) + 
R.vm.kill_object(reshape458) + R.vm.kill_object(reshape459) + gv213: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape460: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc130, gv213, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc130) + gv214: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv214, R.dtype("float16")) + _129: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape460, alloc131) + R.vm.kill_object(reshape460) + gv215: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape461: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc131, gv215, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc131) + gv216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape462: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape461, gv216, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape461) + 
model_decoder_layers_7_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] + model_decoder_layers_7_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[663] + gv217: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv217, R.dtype("float16")) + _130: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight2, reshape462, model_decoder_layers_7_self_attn_out_proj_bias2, alloc132) + R.vm.kill_object(reshape462) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias2) + gv218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv218, R.dtype("float16")) + cls.add5(alloc125, alloc132, alloc133) + R.vm.kill_object(alloc125) + R.vm.kill_object(alloc132) + model_decoder_layers_7_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[673] + model_decoder_layers_7_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[674] + gv219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv219, R.dtype("float16")) + 
cls.layer_norm2(alloc133, model_decoder_layers_7_encoder_attn_layer_norm_weight2, model_decoder_layers_7_encoder_attn_layer_norm_bias2, alloc134) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias2) + model_decoder_layers_7_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] + model_decoder_layers_7_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[670] + gv220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv220, R.dtype("float16")) + _133: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight2, alloc134, model_decoder_layers_7_encoder_attn_q_proj_bias2, alloc135) + R.vm.kill_object(alloc134) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias2) + gv221: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape463: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc135, gv221, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc135) + gv222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape464: R.Tensor((seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape463, gv222, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape463) + gv223: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv223, R.dtype("float16")) + _134: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape464, alloc136) + R.vm.kill_object(reshape464) + gv224: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape465: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc136, gv224, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc136) + gv225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape466: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape465, gv225, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape465) + model_decoder_layers_7_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] + model_decoder_layers_7_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[672] + gv226: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv226, R.dtype("float16")) + _135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight2, reshape466, model_decoder_layers_7_encoder_attn_out_proj_bias2, alloc137) + R.vm.kill_object(reshape466) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias2) + gv227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv227, R.dtype("float16")) + cls.add5(alloc133, alloc137, alloc138) + R.vm.kill_object(alloc133) + R.vm.kill_object(alloc137) + model_decoder_layers_7_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[679] + model_decoder_layers_7_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[680] + gv228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv228, R.dtype("float16")) + cls.layer_norm2(alloc138, model_decoder_layers_7_final_layer_norm_weight2, model_decoder_layers_7_final_layer_norm_bias2, alloc139) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias2) + model_decoder_layers_7_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] + 
model_decoder_layers_7_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[676] + gv229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv229, R.dtype("float16")) + _138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight2, alloc139, model_decoder_layers_7_fc1_bias2, alloc140) + R.vm.kill_object(alloc139) + R.vm.kill_object(model_decoder_layers_7_fc1_weight2) + R.vm.kill_object(model_decoder_layers_7_fc1_bias2) + model_decoder_layers_7_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] + model_decoder_layers_7_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[678] + gv230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv230, R.dtype("float16")) + _139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight2, alloc140, model_decoder_layers_7_fc2_bias2, alloc141) + R.vm.kill_object(alloc140) + R.vm.kill_object(model_decoder_layers_7_fc2_weight2) + R.vm.kill_object(model_decoder_layers_7_fc2_bias2) + gv231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc142: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv231, R.dtype("float16")) + 
cls.add5(alloc138, alloc141, alloc142) + R.vm.kill_object(alloc138) + R.vm.kill_object(alloc141) + model_decoder_layers_8_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[688] + model_decoder_layers_8_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[689] + gv232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv232, R.dtype("float16")) + cls.layer_norm2(alloc142, model_decoder_layers_8_self_attn_layer_norm_weight2, model_decoder_layers_8_self_attn_layer_norm_bias2, alloc143) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias2) + model_decoder_layers_8_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] + model_decoder_layers_8_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[685] + gv233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv233, R.dtype("float16")) + _142: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight2, alloc143, model_decoder_layers_8_self_attn_q_proj_bias2, alloc144) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias2) + gv234: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape467: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc144, gv234, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc144) + model_decoder_layers_8_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] + gv235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv235, R.dtype("float16")) + _143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight2, alloc143, alloc145) + R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight2) + gv236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape468: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc145, gv236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc145) + model_decoder_layers_8_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] + model_decoder_layers_8_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[683] + gv237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc146: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage4, R.prim_value(0), gv237, R.dtype("float16")) + _144: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight2, alloc143, model_decoder_layers_8_self_attn_v_proj_bias2, alloc146) + R.vm.kill_object(alloc143) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias2) + gv238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape469: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc146, gv238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc146) + gv239: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc147: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv239, R.dtype("float16")) + cls.concatenate1(reshape467, reshape468, reshape469, alloc147) + R.vm.kill_object(reshape467) + R.vm.kill_object(reshape468) + R.vm.kill_object(reshape469) + gv240: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape470: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc147, gv240, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc147) + gv241: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv241, R.dtype("float16")) + _146: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape470, alloc148) + R.vm.kill_object(reshape470) + gv242: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape471: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc148, gv242, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc148) + gv243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape472: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape471, gv243, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape471) + model_decoder_layers_8_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] + model_decoder_layers_8_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[687] + gv244: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv244, R.dtype("float16")) + _147: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight2, reshape472, model_decoder_layers_8_self_attn_out_proj_bias2, alloc149) + R.vm.kill_object(reshape472) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias2) + gv245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv245, R.dtype("float16")) + cls.add5(alloc142, alloc149, alloc150) + R.vm.kill_object(alloc142) + R.vm.kill_object(alloc149) + model_decoder_layers_8_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[697] + model_decoder_layers_8_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[698] + gv246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv246, R.dtype("float16")) + cls.layer_norm2(alloc150, model_decoder_layers_8_encoder_attn_layer_norm_weight2, model_decoder_layers_8_encoder_attn_layer_norm_bias2, alloc151) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias2) + model_decoder_layers_8_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] + model_decoder_layers_8_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[694] + gv247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv247, R.dtype("float16")) + _150: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight2, alloc151, model_decoder_layers_8_encoder_attn_q_proj_bias2, alloc152) + R.vm.kill_object(alloc151) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias2) + gv248: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape473: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc152, gv248, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc152) + gv249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape474: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape473, gv249, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape473) + gv250: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv250, R.dtype("float16")) + _151: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape474, alloc153) + R.vm.kill_object(reshape474) + gv251: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape475: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc153, gv251, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc153) + gv252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape476: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape475, gv252, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape475) + model_decoder_layers_8_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] + model_decoder_layers_8_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[696] + gv253: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv253, R.dtype("float16")) + _152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight2, reshape476, model_decoder_layers_8_encoder_attn_out_proj_bias2, alloc154) + R.vm.kill_object(reshape476) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias2) 
+ gv254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv254, R.dtype("float16")) + cls.add5(alloc150, alloc154, alloc155) + R.vm.kill_object(alloc150) + R.vm.kill_object(alloc154) + model_decoder_layers_8_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[703] + model_decoder_layers_8_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[704] + gv255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv255, R.dtype("float16")) + cls.layer_norm2(alloc155, model_decoder_layers_8_final_layer_norm_weight2, model_decoder_layers_8_final_layer_norm_bias2, alloc156) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias2) + model_decoder_layers_8_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] + model_decoder_layers_8_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[700] + gv256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv256, R.dtype("float16")) + _155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight2, alloc156, 
model_decoder_layers_8_fc1_bias2, alloc157) + R.vm.kill_object(alloc156) + R.vm.kill_object(model_decoder_layers_8_fc1_weight2) + R.vm.kill_object(model_decoder_layers_8_fc1_bias2) + model_decoder_layers_8_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] + model_decoder_layers_8_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[702] + gv257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv257, R.dtype("float16")) + _156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight2, alloc157, model_decoder_layers_8_fc2_bias2, alloc158) + R.vm.kill_object(alloc157) + R.vm.kill_object(model_decoder_layers_8_fc2_weight2) + R.vm.kill_object(model_decoder_layers_8_fc2_bias2) + gv258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc159: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv258, R.dtype("float16")) + cls.add5(alloc155, alloc158, alloc159) + R.vm.kill_object(alloc155) + R.vm.kill_object(alloc158) + model_decoder_layers_9_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[712] + model_decoder_layers_9_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[713] + gv259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc160: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage5, R.prim_value(0), gv259, R.dtype("float16")) + cls.layer_norm2(alloc159, model_decoder_layers_9_self_attn_layer_norm_weight2, model_decoder_layers_9_self_attn_layer_norm_bias2, alloc160) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias2) + model_decoder_layers_9_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] + model_decoder_layers_9_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[709] + gv260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv260, R.dtype("float16")) + _159: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight2, alloc160, model_decoder_layers_9_self_attn_q_proj_bias2, alloc161) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias2) + gv261: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape477: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc161, gv261, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc161) + model_decoder_layers_9_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] + gv262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv262, R.dtype("float16")) + _160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight2, alloc160, alloc162) + R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight2) + gv263: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape478: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc162, gv263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc162) + model_decoder_layers_9_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] + model_decoder_layers_9_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[707] + gv264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv264, R.dtype("float16")) + _161: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight2, alloc160, model_decoder_layers_9_self_attn_v_proj_bias2, alloc163) + R.vm.kill_object(alloc160) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias2) + gv265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape479: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc163, gv265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc163) + gv266: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc164: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv266, R.dtype("float16")) + cls.concatenate1(reshape477, reshape478, reshape479, alloc164) + R.vm.kill_object(reshape477) + R.vm.kill_object(reshape478) + R.vm.kill_object(reshape479) + gv267: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape480: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc164, gv267, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc164) + gv268: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv268, R.dtype("float16")) + _163: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape480, alloc165) + R.vm.kill_object(reshape480) + gv269: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape481: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc165, gv269, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc165) + gv270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape482: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape481, gv270, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape481) + model_decoder_layers_9_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] + model_decoder_layers_9_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[711] + gv271: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv271, R.dtype("float16")) + _164: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight2, reshape482, model_decoder_layers_9_self_attn_out_proj_bias2, alloc166) + R.vm.kill_object(reshape482) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias2) + gv272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc167: R.Tensor(dtype="float16", ndim=3) 
= R.vm.alloc_tensor(storage5, R.prim_value(0), gv272, R.dtype("float16")) + cls.add5(alloc159, alloc166, alloc167) + R.vm.kill_object(alloc159) + R.vm.kill_object(alloc166) + model_decoder_layers_9_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[721] + model_decoder_layers_9_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[722] + gv273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc168: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv273, R.dtype("float16")) + cls.layer_norm2(alloc167, model_decoder_layers_9_encoder_attn_layer_norm_weight2, model_decoder_layers_9_encoder_attn_layer_norm_bias2, alloc168) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias2) + model_decoder_layers_9_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] + model_decoder_layers_9_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[718] + gv274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv274, R.dtype("float16")) + _167: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_q_proj_weight2, alloc168, model_decoder_layers_9_encoder_attn_q_proj_bias2, alloc169) + R.vm.kill_object(alloc168) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias2) + gv275: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape483: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc169, gv275, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc169) + gv276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape484: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape483, gv276, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape483) + gv277: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc170: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv277, R.dtype("float16")) + _168: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape484, alloc170) + R.vm.kill_object(reshape484) + gv278: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape485: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc170, gv278, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc170) + 
gv279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape486: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape485, gv279, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape485) + model_decoder_layers_9_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] + model_decoder_layers_9_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[720] + gv280: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv280, R.dtype("float16")) + _169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight2, reshape486, model_decoder_layers_9_encoder_attn_out_proj_bias2, alloc171) + R.vm.kill_object(reshape486) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias2) + gv281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv281, R.dtype("float16")) + cls.add5(alloc167, alloc171, alloc172) + R.vm.kill_object(alloc167) + R.vm.kill_object(alloc171) + model_decoder_layers_9_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[727] + 
model_decoder_layers_9_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[728] + gv282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv282, R.dtype("float16")) + cls.layer_norm2(alloc172, model_decoder_layers_9_final_layer_norm_weight2, model_decoder_layers_9_final_layer_norm_bias2, alloc173) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias2) + model_decoder_layers_9_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] + model_decoder_layers_9_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[724] + gv283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv283, R.dtype("float16")) + _172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight2, alloc173, model_decoder_layers_9_fc1_bias2, alloc174) + R.vm.kill_object(alloc173) + R.vm.kill_object(model_decoder_layers_9_fc1_weight2) + R.vm.kill_object(model_decoder_layers_9_fc1_bias2) + model_decoder_layers_9_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] + model_decoder_layers_9_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[726] + gv284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv284, R.dtype("float16")) + _173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight2, alloc174, model_decoder_layers_9_fc2_bias2, alloc175) + R.vm.kill_object(alloc174) + R.vm.kill_object(model_decoder_layers_9_fc2_weight2) + R.vm.kill_object(model_decoder_layers_9_fc2_bias2) + gv285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc176: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv285, R.dtype("float16")) + cls.add5(alloc172, alloc175, alloc176) + R.vm.kill_object(alloc172) + R.vm.kill_object(alloc175) + model_decoder_layers_10_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[736] + model_decoder_layers_10_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[737] + gv286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv286, R.dtype("float16")) + cls.layer_norm2(alloc176, model_decoder_layers_10_self_attn_layer_norm_weight2, model_decoder_layers_10_self_attn_layer_norm_bias2, alloc177) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias2) + model_decoder_layers_10_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] + model_decoder_layers_10_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[733] 
+ gv287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv287, R.dtype("float16")) + _176: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight2, alloc177, model_decoder_layers_10_self_attn_q_proj_bias2, alloc178) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias2) + gv288: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape487: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc178, gv288, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc178) + model_decoder_layers_10_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] + gv289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv289, R.dtype("float16")) + _177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_10_self_attn_k_proj_weight2, alloc177, alloc179) + R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight2) + gv290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape488: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc179, gv290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc179) + model_decoder_layers_10_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] + model_decoder_layers_10_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[731] + gv291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv291, R.dtype("float16")) + _178: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight2, alloc177, model_decoder_layers_10_self_attn_v_proj_bias2, alloc180) + R.vm.kill_object(alloc177) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias2) + gv292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape489: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc180, gv292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc180) + gv293: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + alloc181: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv293, R.dtype("float16")) + cls.concatenate1(reshape487, reshape488, reshape489, alloc181) + R.vm.kill_object(reshape487) + R.vm.kill_object(reshape488) + R.vm.kill_object(reshape489) + gv294: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape490: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc181, gv294, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc181) + gv295: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv295, R.dtype("float16")) + _180: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape490, alloc182) + R.vm.kill_object(reshape490) + gv296: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape491: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc182, gv296, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc182) + gv297: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + reshape492: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape491, gv297, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape491) + model_decoder_layers_10_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] + model_decoder_layers_10_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[735] + gv298: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv298, R.dtype("float16")) + _181: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight2, reshape492, model_decoder_layers_10_self_attn_out_proj_bias2, alloc183) + R.vm.kill_object(reshape492) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias2) + gv299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv299, R.dtype("float16")) + cls.add5(alloc176, alloc183, alloc184) + R.vm.kill_object(alloc176) + R.vm.kill_object(alloc183) + model_decoder_layers_10_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[745] + model_decoder_layers_10_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[746] + gv300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv300, R.dtype("float16")) + cls.layer_norm2(alloc184, model_decoder_layers_10_encoder_attn_layer_norm_weight2, model_decoder_layers_10_encoder_attn_layer_norm_bias2, alloc185) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias2) + model_decoder_layers_10_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] + model_decoder_layers_10_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[742] + gv301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv301, R.dtype("float16")) + _184: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight2, alloc185, model_decoder_layers_10_encoder_attn_q_proj_bias2, alloc186) + R.vm.kill_object(alloc185) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias2) + gv302: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape493: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc186, gv302, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc186) + gv303: R.Shape(ndim=3) 
= R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape494: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape493, gv303, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape493) + gv304: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv304, R.dtype("float16")) + _185: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape494, alloc187) + R.vm.kill_object(reshape494) + gv305: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape495: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc187, gv305, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc187) + gv306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape496: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape495, gv306, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape495) + model_decoder_layers_10_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] + 
model_decoder_layers_10_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[744] + gv307: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv307, R.dtype("float16")) + _186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight2, reshape496, model_decoder_layers_10_encoder_attn_out_proj_bias2, alloc188) + R.vm.kill_object(reshape496) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias2) + gv308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv308, R.dtype("float16")) + cls.add5(alloc184, alloc188, alloc189) + R.vm.kill_object(alloc184) + R.vm.kill_object(alloc188) + model_decoder_layers_10_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[751] + model_decoder_layers_10_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[752] + gv309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv309, R.dtype("float16")) + cls.layer_norm2(alloc189, model_decoder_layers_10_final_layer_norm_weight2, model_decoder_layers_10_final_layer_norm_bias2, 
alloc190) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias2) + model_decoder_layers_10_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] + model_decoder_layers_10_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[748] + gv310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv310, R.dtype("float16")) + _189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight2, alloc190, model_decoder_layers_10_fc1_bias2, alloc191) + R.vm.kill_object(alloc190) + R.vm.kill_object(model_decoder_layers_10_fc1_weight2) + R.vm.kill_object(model_decoder_layers_10_fc1_bias2) + model_decoder_layers_10_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] + model_decoder_layers_10_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[750] + gv311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv311, R.dtype("float16")) + _190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight2, alloc191, model_decoder_layers_10_fc2_bias2, alloc192) + R.vm.kill_object(alloc191) + R.vm.kill_object(model_decoder_layers_10_fc2_weight2) + R.vm.kill_object(model_decoder_layers_10_fc2_bias2) + gv312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc193: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv312, R.dtype("float16")) + cls.add5(alloc189, alloc192, alloc193) + R.vm.kill_object(alloc189) + R.vm.kill_object(alloc192) + model_decoder_layers_11_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[760] + model_decoder_layers_11_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[761] + gv313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv313, R.dtype("float16")) + cls.layer_norm2(alloc193, model_decoder_layers_11_self_attn_layer_norm_weight2, model_decoder_layers_11_self_attn_layer_norm_bias2, alloc194) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias2) + model_decoder_layers_11_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] + model_decoder_layers_11_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[757] + gv314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv314, R.dtype("float16")) + _193: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight2, alloc194, model_decoder_layers_11_self_attn_q_proj_bias2, alloc195) + 
R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias2) + gv315: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape497: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc195, gv315, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc195) + model_decoder_layers_11_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] + gv316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc196: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv316, R.dtype("float16")) + _194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight2, alloc194, alloc196) + R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight2) + gv317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape498: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc196, gv317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc196) + model_decoder_layers_11_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] + model_decoder_layers_11_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[755] + gv318: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv318, R.dtype("float16")) + _195: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight2, alloc194, model_decoder_layers_11_self_attn_v_proj_bias2, alloc197) + R.vm.kill_object(alloc194) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias2) + gv319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape499: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc197, gv319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc197) + gv320: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc198: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv320, R.dtype("float16")) + cls.concatenate1(reshape497, reshape498, reshape499, alloc198) + R.vm.kill_object(reshape497) + R.vm.kill_object(reshape498) + R.vm.kill_object(reshape499) + gv321: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape500: R.Tensor((seq_len, 60, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc198, gv321, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc198) + gv322: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv322, R.dtype("float16")) + _197: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape500, alloc199) + R.vm.kill_object(reshape500) + gv323: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape501: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc199, gv323, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc199) + gv324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape502: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape501, gv324, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape501) + model_decoder_layers_11_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] + model_decoder_layers_11_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[759] + gv325: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv325, R.dtype("float16")) + _198: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight2, reshape502, model_decoder_layers_11_self_attn_out_proj_bias2, alloc200) + R.vm.kill_object(reshape502) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias2) + gv326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv326, R.dtype("float16")) + cls.add5(alloc193, alloc200, alloc201) + R.vm.kill_object(alloc193) + R.vm.kill_object(alloc200) + model_decoder_layers_11_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[769] + model_decoder_layers_11_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[770] + gv327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv327, R.dtype("float16")) + cls.layer_norm2(alloc201, model_decoder_layers_11_encoder_attn_layer_norm_weight2, model_decoder_layers_11_encoder_attn_layer_norm_bias2, alloc202) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias2) + model_decoder_layers_11_encoder_attn_q_proj_weight2: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[765] + model_decoder_layers_11_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[766] + gv328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv328, R.dtype("float16")) + _201: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight2, alloc202, model_decoder_layers_11_encoder_attn_q_proj_bias2, alloc203) + R.vm.kill_object(alloc202) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias2) + gv329: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape503: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc203, gv329, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc203) + gv330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape504: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape503, gv330, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape503) + gv331: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv331, R.dtype("float16")) + _202: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape504, alloc204) + R.vm.kill_object(reshape504) + gv332: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape505: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc204, gv332, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc204) + gv333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape506: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape505, gv333, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape505) + model_decoder_layers_11_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] + model_decoder_layers_11_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[768] + gv334: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv334, R.dtype("float16")) + _203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_11_encoder_attn_out_proj_weight2, reshape506, model_decoder_layers_11_encoder_attn_out_proj_bias2, alloc205) + R.vm.kill_object(reshape506) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias2) + gv335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv335, R.dtype("float16")) + cls.add5(alloc201, alloc205, alloc206) + R.vm.kill_object(alloc201) + R.vm.kill_object(alloc205) + model_decoder_layers_11_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[775] + model_decoder_layers_11_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[776] + gv336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv336, R.dtype("float16")) + cls.layer_norm2(alloc206, model_decoder_layers_11_final_layer_norm_weight2, model_decoder_layers_11_final_layer_norm_bias2, alloc207) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias2) + model_decoder_layers_11_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] + model_decoder_layers_11_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[772] + gv337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), 
sinfo_args=(R.Shape(ndim=3),)) + alloc208: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv337, R.dtype("float16")) + _206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight2, alloc207, model_decoder_layers_11_fc1_bias2, alloc208) + R.vm.kill_object(alloc207) + R.vm.kill_object(model_decoder_layers_11_fc1_weight2) + R.vm.kill_object(model_decoder_layers_11_fc1_bias2) + model_decoder_layers_11_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] + model_decoder_layers_11_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[774] + gv338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv338, R.dtype("float16")) + _207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight2, alloc208, model_decoder_layers_11_fc2_bias2, alloc209) + R.vm.kill_object(alloc208) + R.vm.kill_object(model_decoder_layers_11_fc2_weight2) + R.vm.kill_object(model_decoder_layers_11_fc2_bias2) + gv339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc210: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv339, R.dtype("float16")) + cls.add5(alloc206, alloc209, alloc210) + R.vm.kill_object(alloc206) + R.vm.kill_object(alloc209) + model_decoder_layers_12_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[784] + model_decoder_layers_12_self_attn_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[785] + gv340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv340, R.dtype("float16")) + cls.layer_norm2(alloc210, model_decoder_layers_12_self_attn_layer_norm_weight2, model_decoder_layers_12_self_attn_layer_norm_bias2, alloc211) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias2) + model_decoder_layers_12_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] + model_decoder_layers_12_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[781] + gv341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv341, R.dtype("float16")) + _210: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight2, alloc211, model_decoder_layers_12_self_attn_q_proj_bias2, alloc212) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias2) + gv342: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape507: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc212, gv342, sinfo_args=(R.Tensor((1, 
seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc212) + model_decoder_layers_12_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] + gv343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv343, R.dtype("float16")) + _211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight2, alloc211, alloc213) + R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight2) + gv344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape508: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc213, gv344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc213) + model_decoder_layers_12_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] + model_decoder_layers_12_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[779] + gv345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv345, R.dtype("float16")) + _212: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight2, alloc211, model_decoder_layers_12_self_attn_v_proj_bias2, 
alloc214) + R.vm.kill_object(alloc211) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias2) + gv346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape509: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc214, gv346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc214) + gv347: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc215: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv347, R.dtype("float16")) + cls.concatenate1(reshape507, reshape508, reshape509, alloc215) + R.vm.kill_object(reshape507) + R.vm.kill_object(reshape508) + R.vm.kill_object(reshape509) + gv348: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape510: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc215, gv348, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc215) + gv349: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv349, R.dtype("float16")) + _214: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape510, alloc216) + R.vm.kill_object(reshape510) + gv350: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape511: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc216, gv350, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc216) + gv351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape512: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape511, gv351, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape511) + model_decoder_layers_12_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] + model_decoder_layers_12_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[783] + gv352: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv352, R.dtype("float16")) + _215: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight2, reshape512, model_decoder_layers_12_self_attn_out_proj_bias2, alloc217) + R.vm.kill_object(reshape512) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias2) + gv353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv353, R.dtype("float16")) + cls.add5(alloc210, alloc217, alloc218) + R.vm.kill_object(alloc210) + R.vm.kill_object(alloc217) + model_decoder_layers_12_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[793] + model_decoder_layers_12_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[794] + gv354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv354, R.dtype("float16")) + cls.layer_norm2(alloc218, model_decoder_layers_12_encoder_attn_layer_norm_weight2, model_decoder_layers_12_encoder_attn_layer_norm_bias2, alloc219) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias2) + model_decoder_layers_12_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] + model_decoder_layers_12_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[790] + gv355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv355, R.dtype("float16")) + _218: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight2, alloc219, model_decoder_layers_12_encoder_attn_q_proj_bias2, alloc220) + R.vm.kill_object(alloc219) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias2) + gv356: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape513: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc220, gv356, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc220) + gv357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape514: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape513, gv357, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape513) + gv358: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv358, R.dtype("float16")) + _219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape514, alloc221) + R.vm.kill_object(reshape514) + gv359: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape515: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc221, gv359, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc221) + gv360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape516: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape515, gv360, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape515) + model_decoder_layers_12_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] + model_decoder_layers_12_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[792] + gv361: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv361, R.dtype("float16")) + _220: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight2, reshape516, model_decoder_layers_12_encoder_attn_out_proj_bias2, alloc222) + R.vm.kill_object(reshape516) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias2) + gv362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc223: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv362, R.dtype("float16")) + cls.add5(alloc218, alloc222, alloc223) + R.vm.kill_object(alloc218) + R.vm.kill_object(alloc222) + model_decoder_layers_12_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[799] + model_decoder_layers_12_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[800] + gv363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv363, R.dtype("float16")) + cls.layer_norm2(alloc223, model_decoder_layers_12_final_layer_norm_weight2, model_decoder_layers_12_final_layer_norm_bias2, alloc224) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias2) + model_decoder_layers_12_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] + model_decoder_layers_12_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[796] + gv364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv364, R.dtype("float16")) + _223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight2, alloc224, model_decoder_layers_12_fc1_bias2, alloc225) + R.vm.kill_object(alloc224) + R.vm.kill_object(model_decoder_layers_12_fc1_weight2) + R.vm.kill_object(model_decoder_layers_12_fc1_bias2) + model_decoder_layers_12_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] + 
model_decoder_layers_12_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[798] + gv365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv365, R.dtype("float16")) + _224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight2, alloc225, model_decoder_layers_12_fc2_bias2, alloc226) + R.vm.kill_object(alloc225) + R.vm.kill_object(model_decoder_layers_12_fc2_weight2) + R.vm.kill_object(model_decoder_layers_12_fc2_bias2) + gv366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc227: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv366, R.dtype("float16")) + cls.add5(alloc223, alloc226, alloc227) + R.vm.kill_object(alloc223) + R.vm.kill_object(alloc226) + model_decoder_layers_13_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[808] + model_decoder_layers_13_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[809] + gv367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv367, R.dtype("float16")) + cls.layer_norm2(alloc227, model_decoder_layers_13_self_attn_layer_norm_weight2, model_decoder_layers_13_self_attn_layer_norm_bias2, alloc228) + 
R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias2) + model_decoder_layers_13_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] + model_decoder_layers_13_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[805] + gv368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv368, R.dtype("float16")) + _227: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight2, alloc228, model_decoder_layers_13_self_attn_q_proj_bias2, alloc229) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias2) + gv369: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape517: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc229, gv369, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc229) + model_decoder_layers_13_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] + gv370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv370, R.dtype("float16")) + 
_228: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight2, alloc228, alloc230) + R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight2) + gv371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape518: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc230, gv371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc230) + model_decoder_layers_13_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] + model_decoder_layers_13_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[803] + gv372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv372, R.dtype("float16")) + _229: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight2, alloc228, model_decoder_layers_13_self_attn_v_proj_bias2, alloc231) + R.vm.kill_object(alloc228) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias2) + gv373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape519: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc231, gv373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc231) + gv374: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc232: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv374, R.dtype("float16")) + cls.concatenate1(reshape517, reshape518, reshape519, alloc232) + R.vm.kill_object(reshape517) + R.vm.kill_object(reshape518) + R.vm.kill_object(reshape519) + gv375: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape520: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc232, gv375, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc232) + gv376: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv376, R.dtype("float16")) + _231: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape520, alloc233) + R.vm.kill_object(reshape520) + gv377: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape521: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc233, gv377, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc233) + gv378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape522: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape521, gv378, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape521) + model_decoder_layers_13_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] + model_decoder_layers_13_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[807] + gv379: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv379, R.dtype("float16")) + _232: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight2, reshape522, model_decoder_layers_13_self_attn_out_proj_bias2, alloc234) + R.vm.kill_object(reshape522) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias2) + gv380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv380, R.dtype("float16")) + cls.add5(alloc227, alloc234, alloc235) + R.vm.kill_object(alloc227) + 
R.vm.kill_object(alloc234) + model_decoder_layers_13_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[817] + model_decoder_layers_13_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[818] + gv381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv381, R.dtype("float16")) + cls.layer_norm2(alloc235, model_decoder_layers_13_encoder_attn_layer_norm_weight2, model_decoder_layers_13_encoder_attn_layer_norm_bias2, alloc236) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias2) + model_decoder_layers_13_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] + model_decoder_layers_13_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[814] + gv382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv382, R.dtype("float16")) + _235: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight2, alloc236, model_decoder_layers_13_encoder_attn_q_proj_bias2, alloc237) + R.vm.kill_object(alloc236) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias2) + gv383: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape523: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc237, gv383, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc237) + gv384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape524: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape523, gv384, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape523) + gv385: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv385, R.dtype("float16")) + _236: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape524, alloc238) + R.vm.kill_object(reshape524) + gv386: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape525: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc238, gv386, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc238) + gv387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape526: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape525, gv387, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape525) + model_decoder_layers_13_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] + model_decoder_layers_13_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[816] + gv388: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv388, R.dtype("float16")) + _237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight2, reshape526, model_decoder_layers_13_encoder_attn_out_proj_bias2, alloc239) + R.vm.kill_object(reshape526) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias2) + gv389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv389, R.dtype("float16")) + cls.add5(alloc235, alloc239, alloc240) + R.vm.kill_object(alloc235) + R.vm.kill_object(alloc239) + model_decoder_layers_13_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[823] + model_decoder_layers_13_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[824] + gv390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv390, R.dtype("float16")) + cls.layer_norm2(alloc240, model_decoder_layers_13_final_layer_norm_weight2, model_decoder_layers_13_final_layer_norm_bias2, alloc241) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias2) + model_decoder_layers_13_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] + model_decoder_layers_13_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[820] + gv391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv391, R.dtype("float16")) + _240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight2, alloc241, model_decoder_layers_13_fc1_bias2, alloc242) + R.vm.kill_object(alloc241) + R.vm.kill_object(model_decoder_layers_13_fc1_weight2) + R.vm.kill_object(model_decoder_layers_13_fc1_bias2) + model_decoder_layers_13_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] + model_decoder_layers_13_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[822] + gv392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc243: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv392, R.dtype("float16")) + _241: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight2, alloc242, model_decoder_layers_13_fc2_bias2, alloc243) + R.vm.kill_object(alloc242) + R.vm.kill_object(model_decoder_layers_13_fc2_weight2) + R.vm.kill_object(model_decoder_layers_13_fc2_bias2) + gv393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc244: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv393, R.dtype("float16")) + cls.add5(alloc240, alloc243, alloc244) + R.vm.kill_object(alloc240) + R.vm.kill_object(alloc243) + model_decoder_layers_14_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[832] + model_decoder_layers_14_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[833] + gv394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv394, R.dtype("float16")) + cls.layer_norm2(alloc244, model_decoder_layers_14_self_attn_layer_norm_weight2, model_decoder_layers_14_self_attn_layer_norm_bias2, alloc245) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias2) + model_decoder_layers_14_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] + model_decoder_layers_14_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[829] + gv395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv395, R.dtype("float16")) + _244: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight2, alloc245, model_decoder_layers_14_self_attn_q_proj_bias2, alloc246) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias2) + gv396: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape527: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc246, gv396, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc246) + model_decoder_layers_14_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] + gv397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv397, R.dtype("float16")) + _245: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight2, alloc245, alloc247) + R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight2) + gv398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape528: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc247, gv398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc247) + model_decoder_layers_14_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] + model_decoder_layers_14_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[827] + gv399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv399, R.dtype("float16")) + _246: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight2, alloc245, model_decoder_layers_14_self_attn_v_proj_bias2, alloc248) + R.vm.kill_object(alloc245) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias2) + gv400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape529: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc248, gv400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc248) + gv401: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc249: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv401, R.dtype("float16")) + 
cls.concatenate1(reshape527, reshape528, reshape529, alloc249) + R.vm.kill_object(reshape527) + R.vm.kill_object(reshape528) + R.vm.kill_object(reshape529) + gv402: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape530: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc249, gv402, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc249) + gv403: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv403, R.dtype("float16")) + _248: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape530, alloc250) + R.vm.kill_object(reshape530) + gv404: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape531: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc250, gv404, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc250) + gv405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape532: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape531, gv405, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) + R.vm.kill_object(reshape531) + model_decoder_layers_14_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] + model_decoder_layers_14_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[831] + gv406: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv406, R.dtype("float16")) + _249: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight2, reshape532, model_decoder_layers_14_self_attn_out_proj_bias2, alloc251) + R.vm.kill_object(reshape532) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias2) + gv407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv407, R.dtype("float16")) + cls.add5(alloc244, alloc251, alloc252) + R.vm.kill_object(alloc244) + R.vm.kill_object(alloc251) + model_decoder_layers_14_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[841] + model_decoder_layers_14_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[842] + gv408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc253: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv408, R.dtype("float16")) + cls.layer_norm2(alloc252, model_decoder_layers_14_encoder_attn_layer_norm_weight2, model_decoder_layers_14_encoder_attn_layer_norm_bias2, alloc253) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias2) + model_decoder_layers_14_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] + model_decoder_layers_14_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[838] + gv409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv409, R.dtype("float16")) + _252: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight2, alloc253, model_decoder_layers_14_encoder_attn_q_proj_bias2, alloc254) + R.vm.kill_object(alloc253) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias2) + gv410: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape533: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc254, gv410, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc254) + gv411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape534: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape533, gv411, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape533) + gv412: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv412, R.dtype("float16")) + _253: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape534, alloc255) + R.vm.kill_object(reshape534) + gv413: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape535: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc255, gv413, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc255) + gv414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape536: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape535, gv414, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape535) + model_decoder_layers_14_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] + model_decoder_layers_14_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[840] + gv415: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv415, R.dtype("float16")) + _254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight2, reshape536, model_decoder_layers_14_encoder_attn_out_proj_bias2, alloc256) + R.vm.kill_object(reshape536) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias2) + gv416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv416, R.dtype("float16")) + cls.add5(alloc252, alloc256, alloc257) + R.vm.kill_object(alloc252) + R.vm.kill_object(alloc256) + model_decoder_layers_14_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[847] + model_decoder_layers_14_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[848] + gv417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv417, R.dtype("float16")) + cls.layer_norm2(alloc257, model_decoder_layers_14_final_layer_norm_weight2, model_decoder_layers_14_final_layer_norm_bias2, alloc258) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight2) + 
R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias2) + model_decoder_layers_14_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] + model_decoder_layers_14_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[844] + gv418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv418, R.dtype("float16")) + _257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight2, alloc258, model_decoder_layers_14_fc1_bias2, alloc259) + R.vm.kill_object(alloc258) + R.vm.kill_object(model_decoder_layers_14_fc1_weight2) + R.vm.kill_object(model_decoder_layers_14_fc1_bias2) + model_decoder_layers_14_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] + model_decoder_layers_14_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[846] + gv419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv419, R.dtype("float16")) + _258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight2, alloc259, model_decoder_layers_14_fc2_bias2, alloc260) + R.vm.kill_object(alloc259) + R.vm.kill_object(model_decoder_layers_14_fc2_weight2) + R.vm.kill_object(model_decoder_layers_14_fc2_bias2) + gv420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc261: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv420, R.dtype("float16")) + cls.add5(alloc257, alloc260, alloc261) + R.vm.kill_object(alloc257) + R.vm.kill_object(alloc260) + model_decoder_layers_15_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[856] + model_decoder_layers_15_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[857] + gv421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv421, R.dtype("float16")) + cls.layer_norm2(alloc261, model_decoder_layers_15_self_attn_layer_norm_weight2, model_decoder_layers_15_self_attn_layer_norm_bias2, alloc262) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias2) + model_decoder_layers_15_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] + model_decoder_layers_15_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[853] + gv422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv422, R.dtype("float16")) + _261: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight2, alloc262, model_decoder_layers_15_self_attn_q_proj_bias2, alloc263) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias2) + gv423: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape537: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc263, gv423, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc263) + model_decoder_layers_15_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] + gv424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv424, R.dtype("float16")) + _262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight2, alloc262, alloc264) + R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight2) + gv425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape538: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc264, gv425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc264) + model_decoder_layers_15_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] + model_decoder_layers_15_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[851] + gv426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc265: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv426, R.dtype("float16")) + _263: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight2, alloc262, model_decoder_layers_15_self_attn_v_proj_bias2, alloc265) + R.vm.kill_object(alloc262) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias2) + gv427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape539: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc265, gv427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc265) + gv428: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc266: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv428, R.dtype("float16")) + cls.concatenate1(reshape537, reshape538, reshape539, alloc266) + R.vm.kill_object(reshape537) + R.vm.kill_object(reshape538) + R.vm.kill_object(reshape539) + gv429: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape540: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc266, 
gv429, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc266) + gv430: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv430, R.dtype("float16")) + _265: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape540, alloc267) + R.vm.kill_object(reshape540) + gv431: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape541: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc267, gv431, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc267) + gv432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape542: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape541, gv432, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape541) + model_decoder_layers_15_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] + model_decoder_layers_15_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[855] + gv433: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv433, R.dtype("float16")) + _266: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight2, reshape542, model_decoder_layers_15_self_attn_out_proj_bias2, alloc268) + R.vm.kill_object(reshape542) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias2) + gv434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv434, R.dtype("float16")) + cls.add5(alloc261, alloc268, alloc269) + R.vm.kill_object(alloc261) + R.vm.kill_object(alloc268) + model_decoder_layers_15_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[865] + model_decoder_layers_15_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[866] + gv435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv435, R.dtype("float16")) + cls.layer_norm2(alloc269, model_decoder_layers_15_encoder_attn_layer_norm_weight2, model_decoder_layers_15_encoder_attn_layer_norm_bias2, alloc270) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias2) + model_decoder_layers_15_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] + 
model_decoder_layers_15_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[862] + gv436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv436, R.dtype("float16")) + _269: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight2, alloc270, model_decoder_layers_15_encoder_attn_q_proj_bias2, alloc271) + R.vm.kill_object(alloc270) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias2) + gv437: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape543: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc271, gv437, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc271) + gv438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape544: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape543, gv438, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape543) + gv439: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc272: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv439, R.dtype("float16")) + _270: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape544, alloc272) + R.vm.kill_object(reshape544) + gv440: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape545: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc272, gv440, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc272) + gv441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape546: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape545, gv441, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape545) + model_decoder_layers_15_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] + model_decoder_layers_15_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[864] + gv442: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv442, R.dtype("float16")) + _271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight2, reshape546, 
model_decoder_layers_15_encoder_attn_out_proj_bias2, alloc273) + R.vm.kill_object(reshape546) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias2) + gv443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv443, R.dtype("float16")) + cls.add5(alloc269, alloc273, alloc274) + R.vm.kill_object(alloc269) + R.vm.kill_object(alloc273) + model_decoder_layers_15_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[871] + model_decoder_layers_15_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[872] + gv444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv444, R.dtype("float16")) + cls.layer_norm2(alloc274, model_decoder_layers_15_final_layer_norm_weight2, model_decoder_layers_15_final_layer_norm_bias2, alloc275) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias2) + model_decoder_layers_15_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] + model_decoder_layers_15_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[868] + gv445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc276: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage4, R.prim_value(0), gv445, R.dtype("float16")) + _274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight2, alloc275, model_decoder_layers_15_fc1_bias2, alloc276) + R.vm.kill_object(alloc275) + R.vm.kill_object(model_decoder_layers_15_fc1_weight2) + R.vm.kill_object(model_decoder_layers_15_fc1_bias2) + model_decoder_layers_15_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] + model_decoder_layers_15_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[870] + gv446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv446, R.dtype("float16")) + _275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight2, alloc276, model_decoder_layers_15_fc2_bias2, alloc277) + R.vm.kill_object(alloc276) + R.vm.kill_object(model_decoder_layers_15_fc2_weight2) + R.vm.kill_object(model_decoder_layers_15_fc2_bias2) + gv447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc278: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv447, R.dtype("float16")) + cls.add5(alloc274, alloc277, alloc278) + R.vm.kill_object(alloc274) + R.vm.kill_object(alloc277) + model_decoder_layers_16_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[880] + model_decoder_layers_16_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[881] + gv448: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv448, R.dtype("float16")) + cls.layer_norm2(alloc278, model_decoder_layers_16_self_attn_layer_norm_weight2, model_decoder_layers_16_self_attn_layer_norm_bias2, alloc279) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias2) + model_decoder_layers_16_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] + model_decoder_layers_16_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[877] + gv449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv449, R.dtype("float16")) + _278: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight2, alloc279, model_decoder_layers_16_self_attn_q_proj_bias2, alloc280) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias2) + gv450: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape547: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc280, gv450, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc280) + model_decoder_layers_16_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] + gv451: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc281: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv451, R.dtype("float16")) + _279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight2, alloc279, alloc281) + R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight2) + gv452: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape548: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc281, gv452, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc281) + model_decoder_layers_16_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] + model_decoder_layers_16_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[875] + gv453: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv453, R.dtype("float16")) + _280: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight2, alloc279, model_decoder_layers_16_self_attn_v_proj_bias2, alloc282) + R.vm.kill_object(alloc279) + 
R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias2) + gv454: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape549: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc282, gv454, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc282) + gv455: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc283: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv455, R.dtype("float16")) + cls.concatenate1(reshape547, reshape548, reshape549, alloc283) + R.vm.kill_object(reshape547) + R.vm.kill_object(reshape548) + R.vm.kill_object(reshape549) + gv456: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape550: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc283, gv456, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc283) + gv457: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv457, R.dtype("float16")) + _282: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape550, alloc284) + R.vm.kill_object(reshape550) + gv458: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape551: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc284, gv458, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc284) + gv459: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape552: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape551, gv459, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape551) + model_decoder_layers_16_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] + model_decoder_layers_16_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[879] + gv460: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv460, R.dtype("float16")) + _283: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight2, reshape552, model_decoder_layers_16_self_attn_out_proj_bias2, alloc285) + R.vm.kill_object(reshape552) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias2) + gv461: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv461, R.dtype("float16")) + cls.add5(alloc278, alloc285, alloc286) + R.vm.kill_object(alloc278) + R.vm.kill_object(alloc285) + model_decoder_layers_16_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[889] + model_decoder_layers_16_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[890] + gv462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv462, R.dtype("float16")) + cls.layer_norm2(alloc286, model_decoder_layers_16_encoder_attn_layer_norm_weight2, model_decoder_layers_16_encoder_attn_layer_norm_bias2, alloc287) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias2) + model_decoder_layers_16_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] + model_decoder_layers_16_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[886] + gv463: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv463, R.dtype("float16")) + _286: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight2, alloc287, model_decoder_layers_16_encoder_attn_q_proj_bias2, alloc288) + R.vm.kill_object(alloc287) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias2) + gv464: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape553: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc288, gv464, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc288) + gv465: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape554: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape553, gv465, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape553) + gv466: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv466, R.dtype("float16")) + _287: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape554, alloc289) + R.vm.kill_object(reshape554) + gv467: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape555: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc289, gv467, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc289) + gv468: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape556: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape555, gv468, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape555) + model_decoder_layers_16_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] + model_decoder_layers_16_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[888] + gv469: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv469, R.dtype("float16")) + _288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight2, reshape556, model_decoder_layers_16_encoder_attn_out_proj_bias2, alloc290) + R.vm.kill_object(reshape556) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias2) + gv470: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc291: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv470, R.dtype("float16")) + cls.add5(alloc286, alloc290, alloc291) + R.vm.kill_object(alloc286) + R.vm.kill_object(alloc290) + model_decoder_layers_16_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[895] + model_decoder_layers_16_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[896] + gv471: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv471, R.dtype("float16")) + cls.layer_norm2(alloc291, model_decoder_layers_16_final_layer_norm_weight2, model_decoder_layers_16_final_layer_norm_bias2, alloc292) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias2) + model_decoder_layers_16_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] + model_decoder_layers_16_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[892] + gv472: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv472, R.dtype("float16")) + _291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight2, alloc292, model_decoder_layers_16_fc1_bias2, alloc293) + R.vm.kill_object(alloc292) + R.vm.kill_object(model_decoder_layers_16_fc1_weight2) + R.vm.kill_object(model_decoder_layers_16_fc1_bias2) + model_decoder_layers_16_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] + 
model_decoder_layers_16_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[894] + gv473: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc294: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv473, R.dtype("float16")) + _292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight2, alloc293, model_decoder_layers_16_fc2_bias2, alloc294) + R.vm.kill_object(alloc293) + R.vm.kill_object(model_decoder_layers_16_fc2_weight2) + R.vm.kill_object(model_decoder_layers_16_fc2_bias2) + gv474: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc295: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv474, R.dtype("float16")) + cls.add5(alloc291, alloc294, alloc295) + R.vm.kill_object(alloc291) + R.vm.kill_object(alloc294) + model_decoder_layers_17_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[904] + model_decoder_layers_17_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[905] + gv475: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv475, R.dtype("float16")) + cls.layer_norm2(alloc295, model_decoder_layers_17_self_attn_layer_norm_weight2, model_decoder_layers_17_self_attn_layer_norm_bias2, alloc296) + 
R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias2) + model_decoder_layers_17_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] + model_decoder_layers_17_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[901] + gv476: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv476, R.dtype("float16")) + _295: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight2, alloc296, model_decoder_layers_17_self_attn_q_proj_bias2, alloc297) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias2) + gv477: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape557: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc297, gv477, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc297) + model_decoder_layers_17_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] + gv478: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv478, R.dtype("float16")) + 
_296: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight2, alloc296, alloc298) + R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight2) + gv479: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape558: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc298, gv479, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc298) + model_decoder_layers_17_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] + model_decoder_layers_17_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[899] + gv480: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc299: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv480, R.dtype("float16")) + _297: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight2, alloc296, model_decoder_layers_17_self_attn_v_proj_bias2, alloc299) + R.vm.kill_object(alloc296) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias2) + gv481: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape559: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc299, gv481, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc299) + gv482: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc300: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv482, R.dtype("float16")) + cls.concatenate1(reshape557, reshape558, reshape559, alloc300) + R.vm.kill_object(reshape557) + R.vm.kill_object(reshape558) + R.vm.kill_object(reshape559) + gv483: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape560: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc300, gv483, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc300) + gv484: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv484, R.dtype("float16")) + _299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape560, alloc301) + R.vm.kill_object(reshape560) + gv485: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape561: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc301, gv485, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc301) + gv486: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape562: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape561, gv486, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape561) + model_decoder_layers_17_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] + model_decoder_layers_17_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[903] + gv487: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv487, R.dtype("float16")) + _300: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight2, reshape562, model_decoder_layers_17_self_attn_out_proj_bias2, alloc302) + R.vm.kill_object(reshape562) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias2) + gv488: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv488, R.dtype("float16")) + cls.add5(alloc295, alloc302, alloc303) + R.vm.kill_object(alloc295) + 
R.vm.kill_object(alloc302) + model_decoder_layers_17_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[913] + model_decoder_layers_17_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[914] + gv489: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv489, R.dtype("float16")) + cls.layer_norm2(alloc303, model_decoder_layers_17_encoder_attn_layer_norm_weight2, model_decoder_layers_17_encoder_attn_layer_norm_bias2, alloc304) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias2) + model_decoder_layers_17_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] + model_decoder_layers_17_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[910] + gv490: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv490, R.dtype("float16")) + _303: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight2, alloc304, model_decoder_layers_17_encoder_attn_q_proj_bias2, alloc305) + R.vm.kill_object(alloc304) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias2) + gv491: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape563: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc305, gv491, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc305) + gv492: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape564: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape563, gv492, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape563) + gv493: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv493, R.dtype("float16")) + _304: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape564, alloc306) + R.vm.kill_object(reshape564) + gv494: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape565: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc306, gv494, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc306) + gv495: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape566: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape565, gv495, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape565) + model_decoder_layers_17_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] + model_decoder_layers_17_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[912] + gv496: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv496, R.dtype("float16")) + _305: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight2, reshape566, model_decoder_layers_17_encoder_attn_out_proj_bias2, alloc307) + R.vm.kill_object(reshape566) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias2) + gv497: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv497, R.dtype("float16")) + cls.add5(alloc303, alloc307, alloc308) + R.vm.kill_object(alloc303) + R.vm.kill_object(alloc307) + model_decoder_layers_17_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[919] + model_decoder_layers_17_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[920] + gv498: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv498, R.dtype("float16")) + cls.layer_norm2(alloc308, model_decoder_layers_17_final_layer_norm_weight2, model_decoder_layers_17_final_layer_norm_bias2, alloc309) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias2) + model_decoder_layers_17_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] + model_decoder_layers_17_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[916] + gv499: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv499, R.dtype("float16")) + _308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight2, alloc309, model_decoder_layers_17_fc1_bias2, alloc310) + R.vm.kill_object(alloc309) + R.vm.kill_object(model_decoder_layers_17_fc1_weight2) + R.vm.kill_object(model_decoder_layers_17_fc1_bias2) + model_decoder_layers_17_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] + model_decoder_layers_17_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[918] + gv500: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv500, R.dtype("float16")) + _309: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_17_fc2_weight2, alloc310, model_decoder_layers_17_fc2_bias2, alloc311) + R.vm.kill_object(alloc310) + R.vm.kill_object(model_decoder_layers_17_fc2_weight2) + R.vm.kill_object(model_decoder_layers_17_fc2_bias2) + gv501: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc312: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv501, R.dtype("float16")) + cls.add5(alloc308, alloc311, alloc312) + R.vm.kill_object(alloc308) + R.vm.kill_object(alloc311) + model_decoder_layers_18_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[928] + model_decoder_layers_18_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[929] + gv502: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv502, R.dtype("float16")) + cls.layer_norm2(alloc312, model_decoder_layers_18_self_attn_layer_norm_weight2, model_decoder_layers_18_self_attn_layer_norm_bias2, alloc313) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias2) + model_decoder_layers_18_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] + model_decoder_layers_18_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[925] + gv503: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv503, R.dtype("float16")) + _312: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight2, alloc313, model_decoder_layers_18_self_attn_q_proj_bias2, alloc314) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias2) + gv504: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape567: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc314, gv504, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc314) + model_decoder_layers_18_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] + gv505: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv505, R.dtype("float16")) + _313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight2, alloc313, alloc315) + R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight2) + gv506: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape568: R.Tensor((1, seq_len, 20, 
64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc315, gv506, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc315) + model_decoder_layers_18_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] + model_decoder_layers_18_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[923] + gv507: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv507, R.dtype("float16")) + _314: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight2, alloc313, model_decoder_layers_18_self_attn_v_proj_bias2, alloc316) + R.vm.kill_object(alloc313) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias2) + gv508: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape569: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc316, gv508, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc316) + gv509: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc317: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv509, R.dtype("float16")) + 
cls.concatenate1(reshape567, reshape568, reshape569, alloc317) + R.vm.kill_object(reshape567) + R.vm.kill_object(reshape568) + R.vm.kill_object(reshape569) + gv510: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape570: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc317, gv510, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc317) + gv511: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv511, R.dtype("float16")) + _316: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape570, alloc318) + R.vm.kill_object(reshape570) + gv512: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape571: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc318, gv512, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc318) + gv513: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape572: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape571, gv513, sinfo_args=(R.Tensor((1, seq_len, 
1280), dtype="float16"),)) + R.vm.kill_object(reshape571) + model_decoder_layers_18_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] + model_decoder_layers_18_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[927] + gv514: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv514, R.dtype("float16")) + _317: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight2, reshape572, model_decoder_layers_18_self_attn_out_proj_bias2, alloc319) + R.vm.kill_object(reshape572) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias2) + gv515: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv515, R.dtype("float16")) + cls.add5(alloc312, alloc319, alloc320) + R.vm.kill_object(alloc312) + R.vm.kill_object(alloc319) + model_decoder_layers_18_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[937] + model_decoder_layers_18_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[938] + gv516: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc321: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv516, R.dtype("float16")) + cls.layer_norm2(alloc320, model_decoder_layers_18_encoder_attn_layer_norm_weight2, model_decoder_layers_18_encoder_attn_layer_norm_bias2, alloc321) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias2) + model_decoder_layers_18_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] + model_decoder_layers_18_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[934] + gv517: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv517, R.dtype("float16")) + _320: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight2, alloc321, model_decoder_layers_18_encoder_attn_q_proj_bias2, alloc322) + R.vm.kill_object(alloc321) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias2) + gv518: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape573: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc322, gv518, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc322) + gv519: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape574: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape573, gv519, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape573) + gv520: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv520, R.dtype("float16")) + _321: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape574, alloc323) + R.vm.kill_object(reshape574) + gv521: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape575: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc323, gv521, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc323) + gv522: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape576: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape575, gv522, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape575) + model_decoder_layers_18_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] + model_decoder_layers_18_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[936] + gv523: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv523, R.dtype("float16")) + _322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight2, reshape576, model_decoder_layers_18_encoder_attn_out_proj_bias2, alloc324) + R.vm.kill_object(reshape576) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias2) + gv524: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv524, R.dtype("float16")) + cls.add5(alloc320, alloc324, alloc325) + R.vm.kill_object(alloc320) + R.vm.kill_object(alloc324) + model_decoder_layers_18_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[943] + model_decoder_layers_18_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[944] + gv525: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv525, R.dtype("float16")) + cls.layer_norm2(alloc325, model_decoder_layers_18_final_layer_norm_weight2, model_decoder_layers_18_final_layer_norm_bias2, alloc326) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight2) + 
R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias2) + model_decoder_layers_18_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] + model_decoder_layers_18_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[940] + gv526: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv526, R.dtype("float16")) + _325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight2, alloc326, model_decoder_layers_18_fc1_bias2, alloc327) + R.vm.kill_object(alloc326) + R.vm.kill_object(model_decoder_layers_18_fc1_weight2) + R.vm.kill_object(model_decoder_layers_18_fc1_bias2) + model_decoder_layers_18_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] + model_decoder_layers_18_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[942] + gv527: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc328: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv527, R.dtype("float16")) + _326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight2, alloc327, model_decoder_layers_18_fc2_bias2, alloc328) + R.vm.kill_object(alloc327) + R.vm.kill_object(model_decoder_layers_18_fc2_weight2) + R.vm.kill_object(model_decoder_layers_18_fc2_bias2) + gv528: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc329: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv528, R.dtype("float16")) + cls.add5(alloc325, alloc328, alloc329) + R.vm.kill_object(alloc325) + R.vm.kill_object(alloc328) + model_decoder_layers_19_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[952] + model_decoder_layers_19_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[953] + gv529: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv529, R.dtype("float16")) + cls.layer_norm2(alloc329, model_decoder_layers_19_self_attn_layer_norm_weight2, model_decoder_layers_19_self_attn_layer_norm_bias2, alloc330) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias2) + model_decoder_layers_19_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] + model_decoder_layers_19_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[949] + gv530: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv530, R.dtype("float16")) + _329: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight2, alloc330, model_decoder_layers_19_self_attn_q_proj_bias2, alloc331) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias2) + gv531: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape577: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc331, gv531, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc331) + model_decoder_layers_19_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] + gv532: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv532, R.dtype("float16")) + _330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight2, alloc330, alloc332) + R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight2) + gv533: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape578: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc332, gv533, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc332) + model_decoder_layers_19_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] + model_decoder_layers_19_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[947] + gv534: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv534, R.dtype("float16")) + _331: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight2, alloc330, model_decoder_layers_19_self_attn_v_proj_bias2, alloc333) + R.vm.kill_object(alloc330) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias2) + gv535: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape579: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc333, gv535, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc333) + gv536: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc334: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv536, R.dtype("float16")) + cls.concatenate1(reshape577, reshape578, reshape579, alloc334) + R.vm.kill_object(reshape577) + R.vm.kill_object(reshape578) + R.vm.kill_object(reshape579) + gv537: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape580: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc334, 
gv537, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc334) + gv538: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv538, R.dtype("float16")) + _333: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape580, alloc335) + R.vm.kill_object(reshape580) + gv539: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape581: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc335, gv539, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc335) + gv540: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape582: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape581, gv540, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape581) + model_decoder_layers_19_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] + model_decoder_layers_19_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[951] + gv541: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv541, R.dtype("float16")) + _334: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight2, reshape582, model_decoder_layers_19_self_attn_out_proj_bias2, alloc336) + R.vm.kill_object(reshape582) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias2) + gv542: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv542, R.dtype("float16")) + cls.add5(alloc329, alloc336, alloc337) + R.vm.kill_object(alloc329) + R.vm.kill_object(alloc336) + model_decoder_layers_19_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[961] + model_decoder_layers_19_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[962] + gv543: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv543, R.dtype("float16")) + cls.layer_norm2(alloc337, model_decoder_layers_19_encoder_attn_layer_norm_weight2, model_decoder_layers_19_encoder_attn_layer_norm_bias2, alloc338) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias2) + model_decoder_layers_19_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] + 
model_decoder_layers_19_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[958] + gv544: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv544, R.dtype("float16")) + _337: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight2, alloc338, model_decoder_layers_19_encoder_attn_q_proj_bias2, alloc339) + R.vm.kill_object(alloc338) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias2) + gv545: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape583: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc339, gv545, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc339) + gv546: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape584: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape583, gv546, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape583) + gv547: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc340: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv547, R.dtype("float16")) + _338: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape584, alloc340) + R.vm.kill_object(reshape584) + gv548: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape585: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc340, gv548, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc340) + gv549: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape586: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape585, gv549, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape585) + model_decoder_layers_19_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] + model_decoder_layers_19_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[960] + gv550: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv550, R.dtype("float16")) + _339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight2, reshape586, 
model_decoder_layers_19_encoder_attn_out_proj_bias2, alloc341) + R.vm.kill_object(reshape586) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias2) + gv551: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv551, R.dtype("float16")) + cls.add5(alloc337, alloc341, alloc342) + R.vm.kill_object(alloc337) + R.vm.kill_object(alloc341) + model_decoder_layers_19_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[967] + model_decoder_layers_19_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[968] + gv552: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv552, R.dtype("float16")) + cls.layer_norm2(alloc342, model_decoder_layers_19_final_layer_norm_weight2, model_decoder_layers_19_final_layer_norm_bias2, alloc343) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias2) + model_decoder_layers_19_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] + model_decoder_layers_19_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[964] + gv553: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc344: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage4, R.prim_value(0), gv553, R.dtype("float16")) + _342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight2, alloc343, model_decoder_layers_19_fc1_bias2, alloc344) + R.vm.kill_object(alloc343) + R.vm.kill_object(model_decoder_layers_19_fc1_weight2) + R.vm.kill_object(model_decoder_layers_19_fc1_bias2) + model_decoder_layers_19_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] + model_decoder_layers_19_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[966] + gv554: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc345: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv554, R.dtype("float16")) + _343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight2, alloc344, model_decoder_layers_19_fc2_bias2, alloc345) + R.vm.kill_object(alloc344) + R.vm.kill_object(model_decoder_layers_19_fc2_weight2) + R.vm.kill_object(model_decoder_layers_19_fc2_bias2) + gv555: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc346: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv555, R.dtype("float16")) + cls.add5(alloc342, alloc345, alloc346) + R.vm.kill_object(alloc342) + R.vm.kill_object(alloc345) + model_decoder_layers_20_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[976] + model_decoder_layers_20_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[977] + gv556: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv556, R.dtype("float16")) + cls.layer_norm2(alloc346, model_decoder_layers_20_self_attn_layer_norm_weight2, model_decoder_layers_20_self_attn_layer_norm_bias2, alloc347) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias2) + model_decoder_layers_20_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] + model_decoder_layers_20_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[973] + gv557: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv557, R.dtype("float16")) + _346: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight2, alloc347, model_decoder_layers_20_self_attn_q_proj_bias2, alloc348) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias2) + gv558: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape587: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc348, gv558, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc348) + model_decoder_layers_20_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] + gv559: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv559, R.dtype("float16")) + _347: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight2, alloc347, alloc349) + R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight2) + gv560: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape588: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc349, gv560, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc349) + model_decoder_layers_20_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] + model_decoder_layers_20_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[971] + gv561: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc350: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv561, R.dtype("float16")) + _348: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight2, alloc347, model_decoder_layers_20_self_attn_v_proj_bias2, alloc350) + R.vm.kill_object(alloc347) + 
R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias2) + gv562: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape589: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc350, gv562, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc350) + gv563: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc351: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv563, R.dtype("float16")) + cls.concatenate1(reshape587, reshape588, reshape589, alloc351) + R.vm.kill_object(reshape587) + R.vm.kill_object(reshape588) + R.vm.kill_object(reshape589) + gv564: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape590: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc351, gv564, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc351) + gv565: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv565, R.dtype("float16")) + _350: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape590, alloc352) + R.vm.kill_object(reshape590) + gv566: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape591: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc352, gv566, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc352) + gv567: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape592: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape591, gv567, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape591) + model_decoder_layers_20_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] + model_decoder_layers_20_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[975] + gv568: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv568, R.dtype("float16")) + _351: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight2, reshape592, model_decoder_layers_20_self_attn_out_proj_bias2, alloc353) + R.vm.kill_object(reshape592) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias2) + gv569: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv569, R.dtype("float16")) + cls.add5(alloc346, alloc353, alloc354) + R.vm.kill_object(alloc346) + R.vm.kill_object(alloc353) + model_decoder_layers_20_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[985] + model_decoder_layers_20_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[986] + gv570: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv570, R.dtype("float16")) + cls.layer_norm2(alloc354, model_decoder_layers_20_encoder_attn_layer_norm_weight2, model_decoder_layers_20_encoder_attn_layer_norm_bias2, alloc355) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias2) + model_decoder_layers_20_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] + model_decoder_layers_20_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[982] + gv571: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv571, R.dtype("float16")) + _354: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight2, alloc355, model_decoder_layers_20_encoder_attn_q_proj_bias2, alloc356) + R.vm.kill_object(alloc355) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias2) + gv572: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape593: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc356, gv572, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc356) + gv573: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape594: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape593, gv573, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape593) + gv574: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv574, R.dtype("float16")) + _355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape594, alloc357) + R.vm.kill_object(reshape594) + gv575: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape595: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc357, gv575, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc357) + gv576: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape596: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape595, gv576, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape595) + model_decoder_layers_20_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] + model_decoder_layers_20_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[984] + gv577: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv577, R.dtype("float16")) + _356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight2, reshape596, model_decoder_layers_20_encoder_attn_out_proj_bias2, alloc358) + R.vm.kill_object(reshape596) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias2) + gv578: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc359: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv578, R.dtype("float16")) + cls.add5(alloc354, alloc358, alloc359) + R.vm.kill_object(alloc354) + R.vm.kill_object(alloc358) + model_decoder_layers_20_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[991] + model_decoder_layers_20_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[992] + gv579: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv579, R.dtype("float16")) + cls.layer_norm2(alloc359, model_decoder_layers_20_final_layer_norm_weight2, model_decoder_layers_20_final_layer_norm_bias2, alloc360) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias2) + model_decoder_layers_20_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] + model_decoder_layers_20_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[988] + gv580: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv580, R.dtype("float16")) + _359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight2, alloc360, model_decoder_layers_20_fc1_bias2, alloc361) + R.vm.kill_object(alloc360) + R.vm.kill_object(model_decoder_layers_20_fc1_weight2) + R.vm.kill_object(model_decoder_layers_20_fc1_bias2) + model_decoder_layers_20_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] + 
model_decoder_layers_20_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[990] + gv581: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv581, R.dtype("float16")) + _360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight2, alloc361, model_decoder_layers_20_fc2_bias2, alloc362) + R.vm.kill_object(alloc361) + R.vm.kill_object(model_decoder_layers_20_fc2_weight2) + R.vm.kill_object(model_decoder_layers_20_fc2_bias2) + gv582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc363: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv582, R.dtype("float16")) + cls.add5(alloc359, alloc362, alloc363) + R.vm.kill_object(alloc359) + R.vm.kill_object(alloc362) + model_decoder_layers_21_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1000] + model_decoder_layers_21_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1001] + gv583: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv583, R.dtype("float16")) + cls.layer_norm2(alloc363, model_decoder_layers_21_self_attn_layer_norm_weight2, model_decoder_layers_21_self_attn_layer_norm_bias2, alloc364) + 
R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias2) + model_decoder_layers_21_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] + model_decoder_layers_21_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[997] + gv584: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv584, R.dtype("float16")) + _363: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight2, alloc364, model_decoder_layers_21_self_attn_q_proj_bias2, alloc365) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias2) + gv585: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape597: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc365, gv585, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc365) + model_decoder_layers_21_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] + gv586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc366: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv586, R.dtype("float16")) + 
_364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight2, alloc364, alloc366) + R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight2) + gv587: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape598: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc366, gv587, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc366) + model_decoder_layers_21_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] + model_decoder_layers_21_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[995] + gv588: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv588, R.dtype("float16")) + _365: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight2, alloc364, model_decoder_layers_21_self_attn_v_proj_bias2, alloc367) + R.vm.kill_object(alloc364) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias2) + gv589: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape599: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc367, gv589, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc367) + gv590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc368: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv590, R.dtype("float16")) + cls.concatenate1(reshape597, reshape598, reshape599, alloc368) + R.vm.kill_object(reshape597) + R.vm.kill_object(reshape598) + R.vm.kill_object(reshape599) + gv591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape600: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc368, gv591, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc368) + gv592: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv592, R.dtype("float16")) + _367: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape600, alloc369) + R.vm.kill_object(reshape600) + gv593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape601: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc369, gv593, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc369) + gv594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape602: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape601, gv594, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape601) + model_decoder_layers_21_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] + model_decoder_layers_21_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[999] + gv595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv595, R.dtype("float16")) + _368: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight2, reshape602, model_decoder_layers_21_self_attn_out_proj_bias2, alloc370) + R.vm.kill_object(reshape602) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias2) + gv596: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv596, R.dtype("float16")) + cls.add5(alloc363, alloc370, alloc371) + R.vm.kill_object(alloc363) + 
R.vm.kill_object(alloc370) + model_decoder_layers_21_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1009] + model_decoder_layers_21_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1010] + gv597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv597, R.dtype("float16")) + cls.layer_norm2(alloc371, model_decoder_layers_21_encoder_attn_layer_norm_weight2, model_decoder_layers_21_encoder_attn_layer_norm_bias2, alloc372) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias2) + model_decoder_layers_21_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] + model_decoder_layers_21_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1006] + gv598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv598, R.dtype("float16")) + _371: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight2, alloc372, model_decoder_layers_21_encoder_attn_q_proj_bias2, alloc373) + R.vm.kill_object(alloc372) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias2) + gv599: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape603: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc373, gv599, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc373) + gv600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape604: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape603, gv600, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape603) + gv601: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv601, R.dtype("float16")) + _372: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape604, alloc374) + R.vm.kill_object(reshape604) + gv602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape605: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc374, gv602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc374) + gv603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape606: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape605, gv603, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape605) + model_decoder_layers_21_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] + model_decoder_layers_21_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1008] + gv604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv604, R.dtype("float16")) + _373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight2, reshape606, model_decoder_layers_21_encoder_attn_out_proj_bias2, alloc375) + R.vm.kill_object(reshape606) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias2) + gv605: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv605, R.dtype("float16")) + cls.add5(alloc371, alloc375, alloc376) + R.vm.kill_object(alloc371) + R.vm.kill_object(alloc375) + model_decoder_layers_21_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1015] + model_decoder_layers_21_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1016] + gv606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv606, R.dtype("float16")) + cls.layer_norm2(alloc376, model_decoder_layers_21_final_layer_norm_weight2, model_decoder_layers_21_final_layer_norm_bias2, alloc377) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias2) + model_decoder_layers_21_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] + model_decoder_layers_21_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1012] + gv607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv607, R.dtype("float16")) + _376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight2, alloc377, model_decoder_layers_21_fc1_bias2, alloc378) + R.vm.kill_object(alloc377) + R.vm.kill_object(model_decoder_layers_21_fc1_weight2) + R.vm.kill_object(model_decoder_layers_21_fc1_bias2) + model_decoder_layers_21_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] + model_decoder_layers_21_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1014] + gv608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv608, R.dtype("float16")) + _377: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight2, alloc378, model_decoder_layers_21_fc2_bias2, alloc379) + R.vm.kill_object(alloc378) + R.vm.kill_object(model_decoder_layers_21_fc2_weight2) + R.vm.kill_object(model_decoder_layers_21_fc2_bias2) + gv609: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc380: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv609, R.dtype("float16")) + cls.add5(alloc376, alloc379, alloc380) + R.vm.kill_object(alloc376) + R.vm.kill_object(alloc379) + model_decoder_layers_22_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1024] + model_decoder_layers_22_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1025] + gv610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv610, R.dtype("float16")) + cls.layer_norm2(alloc380, model_decoder_layers_22_self_attn_layer_norm_weight2, model_decoder_layers_22_self_attn_layer_norm_bias2, alloc381) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias2) + model_decoder_layers_22_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] + model_decoder_layers_22_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1021] + gv611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv611, R.dtype("float16")) + _380: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight2, alloc381, model_decoder_layers_22_self_attn_q_proj_bias2, alloc382) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias2) + gv612: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape607: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc382, gv612, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc382) + model_decoder_layers_22_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] + gv613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv613, R.dtype("float16")) + _381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight2, alloc381, alloc383) + R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight2) + gv614: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape608: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc383, gv614, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc383) + model_decoder_layers_22_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] + model_decoder_layers_22_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1019] + gv615: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv615, R.dtype("float16")) + _382: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight2, alloc381, model_decoder_layers_22_self_attn_v_proj_bias2, alloc384) + R.vm.kill_object(alloc381) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias2) + gv616: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape609: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc384, gv616, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc384) + gv617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc385: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv617, R.dtype("float16")) 
+ cls.concatenate1(reshape607, reshape608, reshape609, alloc385) + R.vm.kill_object(reshape607) + R.vm.kill_object(reshape608) + R.vm.kill_object(reshape609) + gv618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape610: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc385, gv618, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc385) + gv619: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv619, R.dtype("float16")) + _384: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape610, alloc386) + R.vm.kill_object(reshape610) + gv620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape611: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc386, gv620, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc386) + gv621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape612: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape611, gv621, sinfo_args=(R.Tensor((1, 
seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape611) + model_decoder_layers_22_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] + model_decoder_layers_22_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1023] + gv622: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv622, R.dtype("float16")) + _385: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight2, reshape612, model_decoder_layers_22_self_attn_out_proj_bias2, alloc387) + R.vm.kill_object(reshape612) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias2) + gv623: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv623, R.dtype("float16")) + cls.add5(alloc380, alloc387, alloc388) + R.vm.kill_object(alloc380) + R.vm.kill_object(alloc387) + model_decoder_layers_22_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1033] + model_decoder_layers_22_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1034] + gv624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc389: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv624, R.dtype("float16")) + cls.layer_norm2(alloc388, model_decoder_layers_22_encoder_attn_layer_norm_weight2, model_decoder_layers_22_encoder_attn_layer_norm_bias2, alloc389) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias2) + model_decoder_layers_22_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] + model_decoder_layers_22_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1030] + gv625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv625, R.dtype("float16")) + _388: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight2, alloc389, model_decoder_layers_22_encoder_attn_q_proj_bias2, alloc390) + R.vm.kill_object(alloc389) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias2) + gv626: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape613: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc390, gv626, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc390) + gv627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape614: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape613, gv627, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape613) + gv628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv628, R.dtype("float16")) + _389: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape614, alloc391) + R.vm.kill_object(reshape614) + gv629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape615: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc391, gv629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc391) + gv630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape616: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape615, gv630, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape615) + model_decoder_layers_22_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] + model_decoder_layers_22_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1032] + gv631: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv631, R.dtype("float16")) + _390: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight2, reshape616, model_decoder_layers_22_encoder_attn_out_proj_bias2, alloc392) + R.vm.kill_object(reshape616) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias2) + gv632: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv632, R.dtype("float16")) + cls.add5(alloc388, alloc392, alloc393) + R.vm.kill_object(alloc388) + R.vm.kill_object(alloc392) + model_decoder_layers_22_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1039] + model_decoder_layers_22_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1040] + gv633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv633, R.dtype("float16")) + cls.layer_norm2(alloc393, model_decoder_layers_22_final_layer_norm_weight2, model_decoder_layers_22_final_layer_norm_bias2, alloc394) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight2) + 
R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias2) + model_decoder_layers_22_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] + model_decoder_layers_22_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1036] + gv634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv634, R.dtype("float16")) + _393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight2, alloc394, model_decoder_layers_22_fc1_bias2, alloc395) + R.vm.kill_object(alloc394) + R.vm.kill_object(model_decoder_layers_22_fc1_weight2) + R.vm.kill_object(model_decoder_layers_22_fc1_bias2) + model_decoder_layers_22_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] + model_decoder_layers_22_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1038] + gv635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv635, R.dtype("float16")) + _394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight2, alloc395, model_decoder_layers_22_fc2_bias2, alloc396) + R.vm.kill_object(alloc395) + R.vm.kill_object(model_decoder_layers_22_fc2_weight2) + R.vm.kill_object(model_decoder_layers_22_fc2_bias2) + gv636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc397: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv636, R.dtype("float16")) + cls.add5(alloc393, alloc396, alloc397) + R.vm.kill_object(alloc393) + R.vm.kill_object(alloc396) + model_decoder_layers_23_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1048] + model_decoder_layers_23_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1049] + gv637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv637, R.dtype("float16")) + cls.layer_norm2(alloc397, model_decoder_layers_23_self_attn_layer_norm_weight2, model_decoder_layers_23_self_attn_layer_norm_bias2, alloc398) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias2) + model_decoder_layers_23_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] + model_decoder_layers_23_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1045] + gv638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv638, R.dtype("float16")) + _397: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight2, alloc398, model_decoder_layers_23_self_attn_q_proj_bias2, alloc399) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias2) + gv639: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape617: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc399, gv639, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc399) + model_decoder_layers_23_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] + gv640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv640, R.dtype("float16")) + _398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight2, alloc398, alloc400) + R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight2) + gv641: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape618: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc400, gv641, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc400) + model_decoder_layers_23_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] + model_decoder_layers_23_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1043] + gv642: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc401: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv642, R.dtype("float16")) + _399: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight2, alloc398, model_decoder_layers_23_self_attn_v_proj_bias2, alloc401) + R.vm.kill_object(alloc398) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias2) + gv643: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape619: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc401, gv643, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc401) + gv644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc402: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv644, R.dtype("float16")) + cls.concatenate1(reshape617, reshape618, reshape619, alloc402) + R.vm.kill_object(reshape617) + R.vm.kill_object(reshape618) + R.vm.kill_object(reshape619) + gv645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape620: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc402, 
gv645, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc402) + gv646: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv646, R.dtype("float16")) + _401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape620, alloc403) + R.vm.kill_object(reshape620) + gv647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape621: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc403, gv647, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc403) + gv648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape622: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape621, gv648, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape621) + model_decoder_layers_23_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] + model_decoder_layers_23_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1047] + gv649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv649, R.dtype("float16")) + _402: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight2, reshape622, model_decoder_layers_23_self_attn_out_proj_bias2, alloc404) + R.vm.kill_object(reshape622) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias2) + gv650: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv650, R.dtype("float16")) + cls.add5(alloc397, alloc404, alloc405) + R.vm.kill_object(alloc397) + R.vm.kill_object(alloc404) + model_decoder_layers_23_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1057] + model_decoder_layers_23_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1058] + gv651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv651, R.dtype("float16")) + cls.layer_norm2(alloc405, model_decoder_layers_23_encoder_attn_layer_norm_weight2, model_decoder_layers_23_encoder_attn_layer_norm_bias2, alloc406) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias2) + model_decoder_layers_23_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] + 
model_decoder_layers_23_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1054] + gv652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc407: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv652, R.dtype("float16")) + _405: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight2, alloc406, model_decoder_layers_23_encoder_attn_q_proj_bias2, alloc407) + R.vm.kill_object(alloc406) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias2) + gv653: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape623: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc407, gv653, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc407) + gv654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape624: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape623, gv654, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape623) + gv655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc408: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv655, R.dtype("float16")) + _406: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape624, alloc408) + R.vm.kill_object(reshape624) + gv656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape625: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc408, gv656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc408) + gv657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape626: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape625, gv657, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape625) + model_decoder_layers_23_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] + model_decoder_layers_23_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1056] + gv658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv658, R.dtype("float16")) + _407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight2, reshape626, 
model_decoder_layers_23_encoder_attn_out_proj_bias2, alloc409) + R.vm.kill_object(reshape626) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias2) + gv659: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv659, R.dtype("float16")) + cls.add5(alloc405, alloc409, alloc410) + R.vm.kill_object(alloc405) + R.vm.kill_object(alloc409) + model_decoder_layers_23_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1063] + model_decoder_layers_23_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1064] + gv660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv660, R.dtype("float16")) + cls.layer_norm2(alloc410, model_decoder_layers_23_final_layer_norm_weight2, model_decoder_layers_23_final_layer_norm_bias2, alloc411) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias2) + model_decoder_layers_23_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] + model_decoder_layers_23_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1060] + gv661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc412: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv661, R.dtype("float16")) + _410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight2, alloc411, model_decoder_layers_23_fc1_bias2, alloc412) + R.vm.kill_object(alloc411) + R.vm.kill_object(model_decoder_layers_23_fc1_weight2) + R.vm.kill_object(model_decoder_layers_23_fc1_bias2) + model_decoder_layers_23_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] + model_decoder_layers_23_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1062] + gv662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv662, R.dtype("float16")) + _411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_23_fc2_weight2, alloc412, model_decoder_layers_23_fc2_bias2, alloc413) + R.vm.kill_object(alloc412) + R.vm.kill_object(model_decoder_layers_23_fc2_weight2) + R.vm.kill_object(model_decoder_layers_23_fc2_bias2) + gv663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc414: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv663, R.dtype("float16")) + cls.add5(alloc410, alloc413, alloc414) + R.vm.kill_object(alloc410) + R.vm.kill_object(alloc413) + model_decoder_layers_24_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1072] + model_decoder_layers_24_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1073] + gv664: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv664, R.dtype("float16")) + cls.layer_norm2(alloc414, model_decoder_layers_24_self_attn_layer_norm_weight2, model_decoder_layers_24_self_attn_layer_norm_bias2, alloc415) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias2) + model_decoder_layers_24_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] + model_decoder_layers_24_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1069] + gv665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv665, R.dtype("float16")) + _414: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight2, alloc415, model_decoder_layers_24_self_attn_q_proj_bias2, alloc416) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias2) + gv666: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape627: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc416, gv666, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc416) + model_decoder_layers_24_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] + gv667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv667, R.dtype("float16")) + _415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight2, alloc415, alloc417) + R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight2) + gv668: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape628: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc417, gv668, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc417) + model_decoder_layers_24_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] + model_decoder_layers_24_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1067] + gv669: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv669, R.dtype("float16")) + _416: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight2, alloc415, model_decoder_layers_24_self_attn_v_proj_bias2, alloc418) + R.vm.kill_object(alloc415) + 
R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias2) + gv670: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape629: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc418, gv670, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc418) + gv671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc419: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv671, R.dtype("float16")) + cls.concatenate1(reshape627, reshape628, reshape629, alloc419) + R.vm.kill_object(reshape627) + R.vm.kill_object(reshape628) + R.vm.kill_object(reshape629) + gv672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape630: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc419, gv672, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc419) + gv673: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv673, R.dtype("float16")) + _418: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape630, alloc420) + R.vm.kill_object(reshape630) + gv674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape631: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc420, gv674, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc420) + gv675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape632: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape631, gv675, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape631) + model_decoder_layers_24_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] + model_decoder_layers_24_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1071] + gv676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv676, R.dtype("float16")) + _419: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight2, reshape632, model_decoder_layers_24_self_attn_out_proj_bias2, alloc421) + R.vm.kill_object(reshape632) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias2) + gv677: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc422: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv677, R.dtype("float16")) + cls.add5(alloc414, alloc421, alloc422) + R.vm.kill_object(alloc414) + R.vm.kill_object(alloc421) + model_decoder_layers_24_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1081] + model_decoder_layers_24_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1082] + gv678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv678, R.dtype("float16")) + cls.layer_norm2(alloc422, model_decoder_layers_24_encoder_attn_layer_norm_weight2, model_decoder_layers_24_encoder_attn_layer_norm_bias2, alloc423) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias2) + model_decoder_layers_24_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] + model_decoder_layers_24_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1078] + gv679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv679, R.dtype("float16")) + _422: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight2, alloc423, model_decoder_layers_24_encoder_attn_q_proj_bias2, alloc424) + R.vm.kill_object(alloc423) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias2) + gv680: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape633: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc424, gv680, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc424) + gv681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape634: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape633, gv681, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape633) + gv682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv682, R.dtype("float16")) + _423: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape634, alloc425) + R.vm.kill_object(reshape634) + gv683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape635: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc425, gv683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc425) + gv684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape636: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape635, gv684, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape635) + model_decoder_layers_24_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] + model_decoder_layers_24_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1080] + gv685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv685, R.dtype("float16")) + _424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight2, reshape636, model_decoder_layers_24_encoder_attn_out_proj_bias2, alloc426) + R.vm.kill_object(reshape636) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias2) + gv686: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc427: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv686, R.dtype("float16")) + cls.add5(alloc422, alloc426, alloc427) + R.vm.kill_object(alloc422) + R.vm.kill_object(alloc426) + model_decoder_layers_24_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1087] + model_decoder_layers_24_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1088] + gv687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv687, R.dtype("float16")) + cls.layer_norm2(alloc427, model_decoder_layers_24_final_layer_norm_weight2, model_decoder_layers_24_final_layer_norm_bias2, alloc428) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias2) + model_decoder_layers_24_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] + model_decoder_layers_24_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1084] + gv688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv688, R.dtype("float16")) + _427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight2, alloc428, model_decoder_layers_24_fc1_bias2, alloc429) + R.vm.kill_object(alloc428) + R.vm.kill_object(model_decoder_layers_24_fc1_weight2) + R.vm.kill_object(model_decoder_layers_24_fc1_bias2) + model_decoder_layers_24_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = 
packed_params[1085] + model_decoder_layers_24_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1086] + gv689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc430: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv689, R.dtype("float16")) + _428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight2, alloc429, model_decoder_layers_24_fc2_bias2, alloc430) + R.vm.kill_object(alloc429) + R.vm.kill_object(model_decoder_layers_24_fc2_weight2) + R.vm.kill_object(model_decoder_layers_24_fc2_bias2) + gv690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc431: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv690, R.dtype("float16")) + cls.add5(alloc427, alloc430, alloc431) + R.vm.kill_object(alloc427) + R.vm.kill_object(alloc430) + model_decoder_layers_25_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1096] + model_decoder_layers_25_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1097] + gv691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv691, R.dtype("float16")) + cls.layer_norm2(alloc431, model_decoder_layers_25_self_attn_layer_norm_weight2, model_decoder_layers_25_self_attn_layer_norm_bias2, alloc432) + 
R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias2) + model_decoder_layers_25_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] + model_decoder_layers_25_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1093] + gv692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv692, R.dtype("float16")) + _431: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight2, alloc432, model_decoder_layers_25_self_attn_q_proj_bias2, alloc433) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias2) + gv693: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape637: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc433, gv693, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc433) + model_decoder_layers_25_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] + gv694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv694, R.dtype("float16")) + 
_432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight2, alloc432, alloc434) + R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight2) + gv695: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape638: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc434, gv695, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc434) + model_decoder_layers_25_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] + model_decoder_layers_25_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1091] + gv696: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv696, R.dtype("float16")) + _433: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight2, alloc432, model_decoder_layers_25_self_attn_v_proj_bias2, alloc435) + R.vm.kill_object(alloc432) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias2) + gv697: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape639: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc435, gv697, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc435) + gv698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc436: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv698, R.dtype("float16")) + cls.concatenate1(reshape637, reshape638, reshape639, alloc436) + R.vm.kill_object(reshape637) + R.vm.kill_object(reshape638) + R.vm.kill_object(reshape639) + gv699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape640: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc436, gv699, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc436) + gv700: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv700, R.dtype("float16")) + _435: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape640, alloc437) + R.vm.kill_object(reshape640) + gv701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape641: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc437, gv701, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc437) + gv702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape642: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape641, gv702, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape641) + model_decoder_layers_25_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] + model_decoder_layers_25_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1095] + gv703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv703, R.dtype("float16")) + _436: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight2, reshape642, model_decoder_layers_25_self_attn_out_proj_bias2, alloc438) + R.vm.kill_object(reshape642) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias2) + gv704: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv704, R.dtype("float16")) + cls.add5(alloc431, alloc438, alloc439) + R.vm.kill_object(alloc431) + 
R.vm.kill_object(alloc438) + model_decoder_layers_25_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1105] + model_decoder_layers_25_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1106] + gv705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc440: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv705, R.dtype("float16")) + cls.layer_norm2(alloc439, model_decoder_layers_25_encoder_attn_layer_norm_weight2, model_decoder_layers_25_encoder_attn_layer_norm_bias2, alloc440) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias2) + model_decoder_layers_25_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] + model_decoder_layers_25_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1102] + gv706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv706, R.dtype("float16")) + _439: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight2, alloc440, model_decoder_layers_25_encoder_attn_q_proj_bias2, alloc441) + R.vm.kill_object(alloc440) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias2) + gv707: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape643: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc441, gv707, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc441) + gv708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape644: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape643, gv708, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape643) + gv709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv709, R.dtype("float16")) + _440: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape644, alloc442) + R.vm.kill_object(reshape644) + gv710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape645: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc442, gv710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc442) + gv711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape646: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape645, gv711, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape645) + model_decoder_layers_25_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] + model_decoder_layers_25_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1104] + gv712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv712, R.dtype("float16")) + _441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight2, reshape646, model_decoder_layers_25_encoder_attn_out_proj_bias2, alloc443) + R.vm.kill_object(reshape646) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias2) + gv713: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv713, R.dtype("float16")) + cls.add5(alloc439, alloc443, alloc444) + R.vm.kill_object(alloc439) + R.vm.kill_object(alloc443) + model_decoder_layers_25_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1111] + model_decoder_layers_25_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1112] + gv714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv714, R.dtype("float16")) + cls.layer_norm2(alloc444, model_decoder_layers_25_final_layer_norm_weight2, model_decoder_layers_25_final_layer_norm_bias2, alloc445) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias2) + model_decoder_layers_25_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] + model_decoder_layers_25_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1108] + gv715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv715, R.dtype("float16")) + _444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight2, alloc445, model_decoder_layers_25_fc1_bias2, alloc446) + R.vm.kill_object(alloc445) + R.vm.kill_object(model_decoder_layers_25_fc1_weight2) + R.vm.kill_object(model_decoder_layers_25_fc1_bias2) + model_decoder_layers_25_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] + model_decoder_layers_25_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1110] + gv716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv716, R.dtype("float16")) + _445: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight2, alloc446, model_decoder_layers_25_fc2_bias2, alloc447) + R.vm.kill_object(alloc446) + R.vm.kill_object(model_decoder_layers_25_fc2_weight2) + R.vm.kill_object(model_decoder_layers_25_fc2_bias2) + gv717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc448: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv717, R.dtype("float16")) + cls.add5(alloc444, alloc447, alloc448) + R.vm.kill_object(alloc444) + R.vm.kill_object(alloc447) + model_decoder_layers_26_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1120] + model_decoder_layers_26_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1121] + gv718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv718, R.dtype("float16")) + cls.layer_norm2(alloc448, model_decoder_layers_26_self_attn_layer_norm_weight2, model_decoder_layers_26_self_attn_layer_norm_bias2, alloc449) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias2) + model_decoder_layers_26_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] + model_decoder_layers_26_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1117] + gv719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv719, R.dtype("float16")) + _448: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight2, alloc449, model_decoder_layers_26_self_attn_q_proj_bias2, alloc450) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias2) + gv720: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape647: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc450, gv720, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc450) + model_decoder_layers_26_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] + gv721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc451: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv721, R.dtype("float16")) + _449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_26_self_attn_k_proj_weight2, alloc449, alloc451) + R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight2) + gv722: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape648: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc451, gv722, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc451) + model_decoder_layers_26_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] + model_decoder_layers_26_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1115] + gv723: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv723, R.dtype("float16")) + _450: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight2, alloc449, model_decoder_layers_26_self_attn_v_proj_bias2, alloc452) + R.vm.kill_object(alloc449) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias2) + gv724: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape649: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc452, gv724, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc452) + gv725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc453: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv725, R.dtype("float16")) 
+ cls.concatenate1(reshape647, reshape648, reshape649, alloc453) + R.vm.kill_object(reshape647) + R.vm.kill_object(reshape648) + R.vm.kill_object(reshape649) + gv726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape650: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc453, gv726, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc453) + gv727: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv727, R.dtype("float16")) + _452: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape650, alloc454) + R.vm.kill_object(reshape650) + gv728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape651: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc454, gv728, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc454) + gv729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape652: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape651, gv729, sinfo_args=(R.Tensor((1, 
seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape651) + model_decoder_layers_26_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] + model_decoder_layers_26_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1119] + gv730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv730, R.dtype("float16")) + _453: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight2, reshape652, model_decoder_layers_26_self_attn_out_proj_bias2, alloc455) + R.vm.kill_object(reshape652) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias2) + gv731: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv731, R.dtype("float16")) + cls.add5(alloc448, alloc455, alloc456) + R.vm.kill_object(alloc448) + R.vm.kill_object(alloc455) + model_decoder_layers_26_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1129] + model_decoder_layers_26_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1130] + gv732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc457: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv732, R.dtype("float16")) + cls.layer_norm2(alloc456, model_decoder_layers_26_encoder_attn_layer_norm_weight2, model_decoder_layers_26_encoder_attn_layer_norm_bias2, alloc457) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias2) + model_decoder_layers_26_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] + model_decoder_layers_26_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1126] + gv733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv733, R.dtype("float16")) + _456: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight2, alloc457, model_decoder_layers_26_encoder_attn_q_proj_bias2, alloc458) + R.vm.kill_object(alloc457) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias2) + gv734: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape653: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc458, gv734, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc458) + gv735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape654: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape653, gv735, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape653) + gv736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv736, R.dtype("float16")) + _457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape654, alloc459) + R.vm.kill_object(reshape654) + gv737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape655: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc459, gv737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc459) + gv738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape656: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape655, gv738, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape655) + model_decoder_layers_26_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] + model_decoder_layers_26_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1128] + gv739: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv739, R.dtype("float16")) + _458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight2, reshape656, model_decoder_layers_26_encoder_attn_out_proj_bias2, alloc460) + R.vm.kill_object(reshape656) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias2) + gv740: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv740, R.dtype("float16")) + cls.add5(alloc456, alloc460, alloc461) + R.vm.kill_object(alloc456) + R.vm.kill_object(alloc460) + model_decoder_layers_26_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1135] + model_decoder_layers_26_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1136] + gv741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv741, R.dtype("float16")) + cls.layer_norm2(alloc461, model_decoder_layers_26_final_layer_norm_weight2, model_decoder_layers_26_final_layer_norm_bias2, alloc462) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight2) + 
R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias2) + model_decoder_layers_26_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] + model_decoder_layers_26_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1132] + gv742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv742, R.dtype("float16")) + _461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight2, alloc462, model_decoder_layers_26_fc1_bias2, alloc463) + R.vm.kill_object(alloc462) + R.vm.kill_object(model_decoder_layers_26_fc1_weight2) + R.vm.kill_object(model_decoder_layers_26_fc1_bias2) + model_decoder_layers_26_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] + model_decoder_layers_26_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1134] + gv743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv743, R.dtype("float16")) + _462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight2, alloc463, model_decoder_layers_26_fc2_bias2, alloc464) + R.vm.kill_object(alloc463) + R.vm.kill_object(model_decoder_layers_26_fc2_weight2) + R.vm.kill_object(model_decoder_layers_26_fc2_bias2) + gv744: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc465: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv744, R.dtype("float16")) + cls.add5(alloc461, alloc464, alloc465) + R.vm.kill_object(alloc461) + R.vm.kill_object(alloc464) + model_decoder_layers_27_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1144] + model_decoder_layers_27_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1145] + gv745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv745, R.dtype("float16")) + cls.layer_norm2(alloc465, model_decoder_layers_27_self_attn_layer_norm_weight2, model_decoder_layers_27_self_attn_layer_norm_bias2, alloc466) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias2) + model_decoder_layers_27_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] + model_decoder_layers_27_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1141] + gv746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv746, R.dtype("float16")) + _465: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight2, alloc466, model_decoder_layers_27_self_attn_q_proj_bias2, alloc467) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias2) + gv747: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape657: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc467, gv747, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc467) + model_decoder_layers_27_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] + gv748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv748, R.dtype("float16")) + _466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight2, alloc466, alloc468) + R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight2) + gv749: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape658: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc468, gv749, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc468) + model_decoder_layers_27_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] + model_decoder_layers_27_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1139] + gv750: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv750, R.dtype("float16")) + _467: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight2, alloc466, model_decoder_layers_27_self_attn_v_proj_bias2, alloc469) + R.vm.kill_object(alloc466) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias2) + gv751: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape659: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc469, gv751, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc469) + gv752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc470: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv752, R.dtype("float16")) + cls.concatenate1(reshape657, reshape658, reshape659, alloc470) + R.vm.kill_object(reshape657) + R.vm.kill_object(reshape658) + R.vm.kill_object(reshape659) + gv753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape660: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc470, 
gv753, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc470) + gv754: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv754, R.dtype("float16")) + _469: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape660, alloc471) + R.vm.kill_object(reshape660) + gv755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape661: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc471, gv755, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc471) + gv756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape662: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape661, gv756, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape661) + model_decoder_layers_27_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] + model_decoder_layers_27_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1143] + gv757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv757, R.dtype("float16")) + _470: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight2, reshape662, model_decoder_layers_27_self_attn_out_proj_bias2, alloc472) + R.vm.kill_object(reshape662) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias2) + gv758: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv758, R.dtype("float16")) + cls.add5(alloc465, alloc472, alloc473) + R.vm.kill_object(alloc465) + R.vm.kill_object(alloc472) + model_decoder_layers_27_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1153] + model_decoder_layers_27_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1154] + gv759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv759, R.dtype("float16")) + cls.layer_norm2(alloc473, model_decoder_layers_27_encoder_attn_layer_norm_weight2, model_decoder_layers_27_encoder_attn_layer_norm_bias2, alloc474) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias2) + model_decoder_layers_27_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] + 
model_decoder_layers_27_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1150] + gv760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv760, R.dtype("float16")) + _473: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight2, alloc474, model_decoder_layers_27_encoder_attn_q_proj_bias2, alloc475) + R.vm.kill_object(alloc474) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias2) + gv761: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape663: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc475, gv761, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc475) + gv762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape664: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape663, gv762, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape663) + gv763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc476: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv763, R.dtype("float16")) + _474: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape664, alloc476) + R.vm.kill_object(reshape664) + gv764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape665: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc476, gv764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc476) + gv765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape666: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape665, gv765, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape665) + model_decoder_layers_27_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] + model_decoder_layers_27_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1152] + gv766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv766, R.dtype("float16")) + _475: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight2, reshape666, 
model_decoder_layers_27_encoder_attn_out_proj_bias2, alloc477) + R.vm.kill_object(reshape666) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias2) + gv767: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv767, R.dtype("float16")) + cls.add5(alloc473, alloc477, alloc478) + R.vm.kill_object(alloc473) + R.vm.kill_object(alloc477) + model_decoder_layers_27_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1159] + model_decoder_layers_27_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1160] + gv768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv768, R.dtype("float16")) + cls.layer_norm2(alloc478, model_decoder_layers_27_final_layer_norm_weight2, model_decoder_layers_27_final_layer_norm_bias2, alloc479) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias2) + model_decoder_layers_27_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] + model_decoder_layers_27_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1156] + gv769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc480: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv769, R.dtype("float16")) + _478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight2, alloc479, model_decoder_layers_27_fc1_bias2, alloc480) + R.vm.kill_object(alloc479) + R.vm.kill_object(model_decoder_layers_27_fc1_weight2) + R.vm.kill_object(model_decoder_layers_27_fc1_bias2) + model_decoder_layers_27_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] + model_decoder_layers_27_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1158] + gv770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv770, R.dtype("float16")) + _479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight2, alloc480, model_decoder_layers_27_fc2_bias2, alloc481) + R.vm.kill_object(alloc480) + R.vm.kill_object(model_decoder_layers_27_fc2_weight2) + R.vm.kill_object(model_decoder_layers_27_fc2_bias2) + gv771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc482: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv771, R.dtype("float16")) + cls.add5(alloc478, alloc481, alloc482) + R.vm.kill_object(alloc478) + R.vm.kill_object(alloc481) + model_decoder_layers_28_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1168] + model_decoder_layers_28_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1169] + gv772: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv772, R.dtype("float16")) + cls.layer_norm2(alloc482, model_decoder_layers_28_self_attn_layer_norm_weight2, model_decoder_layers_28_self_attn_layer_norm_bias2, alloc483) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias2) + model_decoder_layers_28_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] + model_decoder_layers_28_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1165] + gv773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv773, R.dtype("float16")) + _482: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight2, alloc483, model_decoder_layers_28_self_attn_q_proj_bias2, alloc484) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias2) + gv774: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape667: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc484, gv774, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc484) + model_decoder_layers_28_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] + gv775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv775, R.dtype("float16")) + _483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight2, alloc483, alloc485) + R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight2) + gv776: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape668: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc485, gv776, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc485) + model_decoder_layers_28_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] + model_decoder_layers_28_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1163] + gv777: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv777, R.dtype("float16")) + _484: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight2, alloc483, model_decoder_layers_28_self_attn_v_proj_bias2, alloc486) + R.vm.kill_object(alloc483) + 
R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias2) + gv778: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape669: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc486, gv778, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc486) + gv779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc487: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv779, R.dtype("float16")) + cls.concatenate1(reshape667, reshape668, reshape669, alloc487) + R.vm.kill_object(reshape667) + R.vm.kill_object(reshape668) + R.vm.kill_object(reshape669) + gv780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape670: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc487, gv780, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc487) + gv781: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv781, R.dtype("float16")) + _486: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape670, alloc488) + R.vm.kill_object(reshape670) + gv782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape671: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc488, gv782, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc488) + gv783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape672: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape671, gv783, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape671) + model_decoder_layers_28_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] + model_decoder_layers_28_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1167] + gv784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv784, R.dtype("float16")) + _487: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_out_proj_weight2, reshape672, model_decoder_layers_28_self_attn_out_proj_bias2, alloc489) + R.vm.kill_object(reshape672) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias2) + gv785: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv785, R.dtype("float16")) + cls.add5(alloc482, alloc489, alloc490) + R.vm.kill_object(alloc482) + R.vm.kill_object(alloc489) + model_decoder_layers_28_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1177] + model_decoder_layers_28_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1178] + gv786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv786, R.dtype("float16")) + cls.layer_norm2(alloc490, model_decoder_layers_28_encoder_attn_layer_norm_weight2, model_decoder_layers_28_encoder_attn_layer_norm_bias2, alloc491) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias2) + model_decoder_layers_28_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] + model_decoder_layers_28_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1174] + gv787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv787, R.dtype("float16")) + _490: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight2, alloc491, model_decoder_layers_28_encoder_attn_q_proj_bias2, alloc492) + R.vm.kill_object(alloc491) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias2) + gv788: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape673: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc492, gv788, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc492) + gv789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape674: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape673, gv789, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape673) + gv790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv790, R.dtype("float16")) + _491: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape674, alloc493) + R.vm.kill_object(reshape674) + gv791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape675: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc493, gv791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc493) + gv792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape676: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape675, gv792, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape675) + model_decoder_layers_28_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] + model_decoder_layers_28_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1176] + gv793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv793, R.dtype("float16")) + _492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight2, reshape676, model_decoder_layers_28_encoder_attn_out_proj_bias2, alloc494) + R.vm.kill_object(reshape676) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias2) + gv794: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc495: R.Tensor(dtype="float16", 
ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv794, R.dtype("float16")) + cls.add5(alloc490, alloc494, alloc495) + R.vm.kill_object(alloc490) + R.vm.kill_object(alloc494) + model_decoder_layers_28_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1183] + model_decoder_layers_28_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1184] + gv795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv795, R.dtype("float16")) + cls.layer_norm2(alloc495, model_decoder_layers_28_final_layer_norm_weight2, model_decoder_layers_28_final_layer_norm_bias2, alloc496) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias2) + model_decoder_layers_28_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] + model_decoder_layers_28_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1180] + gv796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv796, R.dtype("float16")) + _495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight2, alloc496, model_decoder_layers_28_fc1_bias2, alloc497) + R.vm.kill_object(alloc496) + R.vm.kill_object(model_decoder_layers_28_fc1_weight2) + R.vm.kill_object(model_decoder_layers_28_fc1_bias2) + model_decoder_layers_28_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = 
packed_params[1181] + model_decoder_layers_28_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1182] + gv797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv797, R.dtype("float16")) + _496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight2, alloc497, model_decoder_layers_28_fc2_bias2, alloc498) + R.vm.kill_object(alloc497) + R.vm.kill_object(model_decoder_layers_28_fc2_weight2) + R.vm.kill_object(model_decoder_layers_28_fc2_bias2) + gv798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc499: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv798, R.dtype("float16")) + cls.add5(alloc495, alloc498, alloc499) + R.vm.kill_object(alloc495) + R.vm.kill_object(alloc498) + model_decoder_layers_29_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1192] + model_decoder_layers_29_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1193] + gv799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv799, R.dtype("float16")) + cls.layer_norm2(alloc499, model_decoder_layers_29_self_attn_layer_norm_weight2, model_decoder_layers_29_self_attn_layer_norm_bias2, alloc500) + 
R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias2) + model_decoder_layers_29_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] + model_decoder_layers_29_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1189] + gv800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv800, R.dtype("float16")) + _499: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight2, alloc500, model_decoder_layers_29_self_attn_q_proj_bias2, alloc501) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias2) + gv801: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape677: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc501, gv801, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc501) + model_decoder_layers_29_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] + gv802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv802, R.dtype("float16")) + 
_500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight2, alloc500, alloc502) + R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight2) + gv803: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape678: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc502, gv803, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc502) + model_decoder_layers_29_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] + model_decoder_layers_29_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1187] + gv804: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc503: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv804, R.dtype("float16")) + _501: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight2, alloc500, model_decoder_layers_29_self_attn_v_proj_bias2, alloc503) + R.vm.kill_object(alloc500) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias2) + gv805: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape679: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc503, gv805, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc503) + gv806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc504: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv806, R.dtype("float16")) + cls.concatenate1(reshape677, reshape678, reshape679, alloc504) + R.vm.kill_object(reshape677) + R.vm.kill_object(reshape678) + R.vm.kill_object(reshape679) + gv807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape680: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc504, gv807, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc504) + gv808: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv808, R.dtype("float16")) + _503: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape680, alloc505) + R.vm.kill_object(reshape680) + gv809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape681: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc505, gv809, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc505) + gv810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape682: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape681, gv810, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape681) + model_decoder_layers_29_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] + model_decoder_layers_29_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1191] + gv811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv811, R.dtype("float16")) + _504: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight2, reshape682, model_decoder_layers_29_self_attn_out_proj_bias2, alloc506) + R.vm.kill_object(reshape682) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias2) + gv812: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv812, R.dtype("float16")) + cls.add5(alloc499, alloc506, alloc507) + R.vm.kill_object(alloc499) + 
R.vm.kill_object(alloc506) + model_decoder_layers_29_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1201] + model_decoder_layers_29_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1202] + gv813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv813, R.dtype("float16")) + cls.layer_norm2(alloc507, model_decoder_layers_29_encoder_attn_layer_norm_weight2, model_decoder_layers_29_encoder_attn_layer_norm_bias2, alloc508) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias2) + model_decoder_layers_29_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] + model_decoder_layers_29_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1198] + gv814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv814, R.dtype("float16")) + _507: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight2, alloc508, model_decoder_layers_29_encoder_attn_q_proj_bias2, alloc509) + R.vm.kill_object(alloc508) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias2) + gv815: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape683: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc509, gv815, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc509) + gv816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape684: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape683, gv816, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape683) + gv817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv817, R.dtype("float16")) + _508: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape684, alloc510) + R.vm.kill_object(reshape684) + gv818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape685: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc510, gv818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc510) + gv819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape686: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape685, gv819, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape685) + model_decoder_layers_29_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] + model_decoder_layers_29_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1200] + gv820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv820, R.dtype("float16")) + _509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight2, reshape686, model_decoder_layers_29_encoder_attn_out_proj_bias2, alloc511) + R.vm.kill_object(reshape686) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias2) + gv821: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv821, R.dtype("float16")) + cls.add5(alloc507, alloc511, alloc512) + R.vm.kill_object(alloc507) + R.vm.kill_object(alloc511) + model_decoder_layers_29_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1207] + model_decoder_layers_29_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1208] + gv822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv822, R.dtype("float16")) + cls.layer_norm2(alloc512, model_decoder_layers_29_final_layer_norm_weight2, model_decoder_layers_29_final_layer_norm_bias2, alloc513) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias2) + model_decoder_layers_29_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] + model_decoder_layers_29_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1204] + gv823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv823, R.dtype("float16")) + _512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight2, alloc513, model_decoder_layers_29_fc1_bias2, alloc514) + R.vm.kill_object(alloc513) + R.vm.kill_object(model_decoder_layers_29_fc1_weight2) + R.vm.kill_object(model_decoder_layers_29_fc1_bias2) + model_decoder_layers_29_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] + model_decoder_layers_29_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1206] + gv824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc515: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv824, R.dtype("float16")) + _513: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_29_fc2_weight2, alloc514, model_decoder_layers_29_fc2_bias2, alloc515) + R.vm.kill_object(alloc514) + R.vm.kill_object(model_decoder_layers_29_fc2_weight2) + R.vm.kill_object(model_decoder_layers_29_fc2_bias2) + gv825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc516: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv825, R.dtype("float16")) + cls.add5(alloc512, alloc515, alloc516) + R.vm.kill_object(alloc512) + R.vm.kill_object(alloc515) + model_decoder_layers_30_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1216] + model_decoder_layers_30_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1217] + gv826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv826, R.dtype("float16")) + cls.layer_norm2(alloc516, model_decoder_layers_30_self_attn_layer_norm_weight2, model_decoder_layers_30_self_attn_layer_norm_bias2, alloc517) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias2) + model_decoder_layers_30_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] + model_decoder_layers_30_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1213] + gv827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv827, R.dtype("float16")) + _516: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight2, alloc517, model_decoder_layers_30_self_attn_q_proj_bias2, alloc518) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias2) + gv828: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape687: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc518, gv828, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc518) + model_decoder_layers_30_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] + gv829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv829, R.dtype("float16")) + _517: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight2, alloc517, alloc519) + R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight2) + gv830: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape688: 
R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc519, gv830, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc519) + model_decoder_layers_30_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] + model_decoder_layers_30_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1211] + gv831: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv831, R.dtype("float16")) + _518: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight2, alloc517, model_decoder_layers_30_self_attn_v_proj_bias2, alloc520) + R.vm.kill_object(alloc517) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias2) + gv832: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape689: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc520, gv832, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc520) + gv833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc521: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv833, R.dtype("float16")) 
+ cls.concatenate1(reshape687, reshape688, reshape689, alloc521) + R.vm.kill_object(reshape687) + R.vm.kill_object(reshape688) + R.vm.kill_object(reshape689) + gv834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape690: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc521, gv834, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc521) + gv835: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv835, R.dtype("float16")) + _520: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape690, alloc522) + R.vm.kill_object(reshape690) + gv836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape691: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc522, gv836, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc522) + gv837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape692: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape691, gv837, sinfo_args=(R.Tensor((1, 
seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape691) + model_decoder_layers_30_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] + model_decoder_layers_30_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1215] + gv838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv838, R.dtype("float16")) + _521: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight2, reshape692, model_decoder_layers_30_self_attn_out_proj_bias2, alloc523) + R.vm.kill_object(reshape692) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias2) + gv839: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv839, R.dtype("float16")) + cls.add5(alloc516, alloc523, alloc524) + R.vm.kill_object(alloc516) + R.vm.kill_object(alloc523) + model_decoder_layers_30_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1225] + model_decoder_layers_30_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1226] + gv840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc525: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage6, R.prim_value(0), gv840, R.dtype("float16")) + cls.layer_norm2(alloc524, model_decoder_layers_30_encoder_attn_layer_norm_weight2, model_decoder_layers_30_encoder_attn_layer_norm_bias2, alloc525) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias2) + model_decoder_layers_30_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] + model_decoder_layers_30_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1222] + gv841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv841, R.dtype("float16")) + _524: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight2, alloc525, model_decoder_layers_30_encoder_attn_q_proj_bias2, alloc526) + R.vm.kill_object(alloc525) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias2) + gv842: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape693: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc526, gv842, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc526) + gv843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape694: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape693, gv843, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape693) + gv844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv844, R.dtype("float16")) + _525: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape694, alloc527) + R.vm.kill_object(reshape694) + gv845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape695: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc527, gv845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc527) + gv846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape696: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape695, gv846, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape695) + model_decoder_layers_30_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] + model_decoder_layers_30_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1224] + gv847: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv847, R.dtype("float16")) + _526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight2, reshape696, model_decoder_layers_30_encoder_attn_out_proj_bias2, alloc528) + R.vm.kill_object(reshape696) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias2) + gv848: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv848, R.dtype("float16")) + cls.add5(alloc524, alloc528, alloc529) + R.vm.kill_object(alloc524) + R.vm.kill_object(alloc528) + model_decoder_layers_30_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1231] + model_decoder_layers_30_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1232] + gv849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc530: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv849, R.dtype("float16")) + cls.layer_norm2(alloc529, model_decoder_layers_30_final_layer_norm_weight2, model_decoder_layers_30_final_layer_norm_bias2, alloc530) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight2) + 
R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias2) + model_decoder_layers_30_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] + model_decoder_layers_30_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1228] + gv850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc531: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv850, R.dtype("float16")) + _529: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight2, alloc530, model_decoder_layers_30_fc1_bias2, alloc531) + R.vm.kill_object(alloc530) + R.vm.kill_object(model_decoder_layers_30_fc1_weight2) + R.vm.kill_object(model_decoder_layers_30_fc1_bias2) + model_decoder_layers_30_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] + model_decoder_layers_30_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1230] + gv851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc532: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv851, R.dtype("float16")) + _530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight2, alloc531, model_decoder_layers_30_fc2_bias2, alloc532) + R.vm.kill_object(alloc531) + R.vm.kill_object(model_decoder_layers_30_fc2_weight2) + R.vm.kill_object(model_decoder_layers_30_fc2_bias2) + gv852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc533: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv852, R.dtype("float16")) + cls.add5(alloc529, alloc532, alloc533) + R.vm.kill_object(alloc529) + R.vm.kill_object(alloc532) + model_decoder_layers_31_self_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1240] + model_decoder_layers_31_self_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1241] + gv853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc534: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv853, R.dtype("float16")) + cls.layer_norm2(alloc533, model_decoder_layers_31_self_attn_layer_norm_weight2, model_decoder_layers_31_self_attn_layer_norm_bias2, alloc534) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias2) + model_decoder_layers_31_self_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] + model_decoder_layers_31_self_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1237] + gv854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc535: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv854, R.dtype("float16")) + _533: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight2, alloc534, model_decoder_layers_31_self_attn_q_proj_bias2, alloc535) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight2) + 
R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias2) + gv855: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape697: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc535, gv855, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc535) + model_decoder_layers_31_self_attn_k_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] + gv856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc536: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv856, R.dtype("float16")) + _534: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight2, alloc534, alloc536) + R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight2) + gv857: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape698: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc536, gv857, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc536) + model_decoder_layers_31_self_attn_v_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] + model_decoder_layers_31_self_attn_v_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1235] + gv858: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc537: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv858, R.dtype("float16")) + _535: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight2, alloc534, model_decoder_layers_31_self_attn_v_proj_bias2, alloc537) + R.vm.kill_object(alloc534) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight2) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias2) + gv859: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape699: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc537, gv859, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc537) + gv860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc538: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv860, R.dtype("float16")) + cls.concatenate1(reshape697, reshape698, reshape699, alloc538) + R.vm.kill_object(reshape697) + R.vm.kill_object(reshape698) + R.vm.kill_object(reshape699) + gv861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape700: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc538, 
gv861, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc538) + gv862: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc539: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv862, R.dtype("float16")) + _537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape700, alloc539) + R.vm.kill_object(reshape700) + gv863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape701: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc539, gv863, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc539) + gv864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape702: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape701, gv864, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape701) + model_decoder_layers_31_self_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] + model_decoder_layers_31_self_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1239] + gv865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc540: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv865, R.dtype("float16")) + _538: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight2, reshape702, model_decoder_layers_31_self_attn_out_proj_bias2, alloc540) + R.vm.kill_object(reshape702) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias2) + gv866: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc541: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv866, R.dtype("float16")) + cls.add5(alloc533, alloc540, alloc541) + R.vm.kill_object(alloc533) + R.vm.kill_object(alloc540) + model_decoder_layers_31_encoder_attn_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1249] + model_decoder_layers_31_encoder_attn_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1250] + gv867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc542: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv867, R.dtype("float16")) + cls.layer_norm2(alloc541, model_decoder_layers_31_encoder_attn_layer_norm_weight2, model_decoder_layers_31_encoder_attn_layer_norm_bias2, alloc542) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias2) + model_decoder_layers_31_encoder_attn_q_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] + 
model_decoder_layers_31_encoder_attn_q_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1246] + gv868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc543: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv868, R.dtype("float16")) + _541: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight2, alloc542, model_decoder_layers_31_encoder_attn_q_proj_bias2, alloc543) + R.vm.kill_object(alloc542) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight2) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias2) + gv869: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape703: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc543, gv869, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc543) + gv870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape704: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape703, gv870, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape703) + gv871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc544: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv871, R.dtype("float16")) + _542: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape704, alloc544) + R.vm.kill_object(reshape704) + gv872: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape705: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc544, gv872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc544) + gv873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape706: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape705, gv873, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape705) + model_decoder_layers_31_encoder_attn_out_proj_weight2: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] + model_decoder_layers_31_encoder_attn_out_proj_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1248] + gv874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc545: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv874, R.dtype("float16")) + _543: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight2, reshape706, 
model_decoder_layers_31_encoder_attn_out_proj_bias2, alloc545) + R.vm.kill_object(reshape706) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight2) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias2) + gv875: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc546: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage6, R.prim_value(0), gv875, R.dtype("float16")) + R.vm.kill_object(storage6) + cls.add5(alloc541, alloc545, alloc546) + R.vm.kill_object(alloc541) + R.vm.kill_object(alloc545) + model_decoder_layers_31_final_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1255] + model_decoder_layers_31_final_layer_norm_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1256] + gv876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc547: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv876, R.dtype("float16")) + cls.layer_norm2(alloc546, model_decoder_layers_31_final_layer_norm_weight2, model_decoder_layers_31_final_layer_norm_bias2, alloc547) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight2) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias2) + model_decoder_layers_31_fc1_weight2: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] + model_decoder_layers_31_fc1_bias2: R.Tensor((5120,), dtype="float16") = packed_params[1252] + gv877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc548: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage4, R.prim_value(0), gv877, R.dtype("float16")) + R.vm.kill_object(storage4) + _546: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_31_fc1_weight2, alloc547, model_decoder_layers_31_fc1_bias2, alloc548) + R.vm.kill_object(alloc547) + R.vm.kill_object(model_decoder_layers_31_fc1_weight2) + R.vm.kill_object(model_decoder_layers_31_fc1_bias2) + model_decoder_layers_31_fc2_weight2: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] + model_decoder_layers_31_fc2_bias2: R.Tensor((1280,), dtype="float16") = packed_params[1254] + gv878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc549: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage5, R.prim_value(0), gv878, R.dtype("float16")) + R.vm.kill_object(storage5) + _547: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_31_fc2_weight2, alloc548, model_decoder_layers_31_fc2_bias2, alloc549) + R.vm.kill_object(alloc548) + R.vm.kill_object(model_decoder_layers_31_fc2_weight2) + R.vm.kill_object(model_decoder_layers_31_fc2_bias2) + gv879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc550: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage7, R.prim_value(0), gv879, R.dtype("float16")) + R.vm.kill_object(storage7) + cls.add5(alloc546, alloc549, alloc550) + R.vm.kill_object(alloc546) + R.vm.kill_object(alloc549) + model_decoder_layer_norm_weight2: R.Tensor((1280,), dtype="float16") = packed_params[1257] + model_decoder_layer_norm_bias2: R.Tensor((1280,), 
dtype="float16") = packed_params[1258] + gv880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc551: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage8, R.prim_value(0), gv880, R.dtype("float16")) + R.vm.kill_object(storage8) + cls.layer_norm2(alloc550, model_decoder_layer_norm_weight2, model_decoder_layer_norm_bias2, alloc551) + R.vm.kill_object(alloc550) + R.vm.kill_object(model_decoder_layer_norm_weight2) + R.vm.kill_object(model_decoder_layer_norm_bias2) + storage9: R.Object = R.vm.alloc_storage(R.shape([20480]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc552: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage9, R.prim_value(0), gv881, R.dtype("float16")) + R.vm.kill_object(storage9) + cls.take2(alloc551, logit_positions, alloc552) + R.vm.kill_object(alloc551) + storage10: R.Object = R.vm.alloc_storage(R.shape([1659712]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(51866), sinfo_args=(R.Shape(ndim=3),)) + alloc553: R.Tensor(dtype="float32", ndim=3) = R.vm.alloc_tensor(storage10, R.prim_value(0), gv882, R.dtype("float32")) + R.vm.kill_object(storage10) + _551: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul5_cublas", model_decoder_embed_tokens_weight2, alloc552, alloc553) + R.vm.kill_object(model_decoder_embed_tokens_weight2) + R.vm.kill_object(alloc552) + R.call_packed("vm.builtin.match_shape", alloc553, 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(51866), R.str("ErrorContext(fn=batch_prefill, loc=return, annotation=R.Tensor((1, batch_size, 51866), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))
+        return alloc553
+
+    @R.function  # Relax function that creates and returns the paged KV cache object.
+    def create_tir_paged_kv_cache(max_batch_size_: R.Shape(["max_batch_size"]), max_total_seq_len_: R.Shape(["max_total_seq_len"]), prefill_chunk_size_: R.Shape(["prefill_chunk_size"]), page_size_: R.Shape(["page_size"]), support_sliding_window_: R.Shape(["support_sliding_window"])) -> R.Object:  # Takes five one-dimensional R.Shape arguments; returns the R.Object produced by the runtime builtin called at the end of the body.
+        max_batch_size = T.int64()  # Symbolic int64 variables; each name matches the symbolic dimension of the corresponding trailing-underscore shape parameter.
+        max_total_seq_len = T.int64()
+        prefill_chunk_size = T.int64()
+        page_size = T.int64()
+        support_sliding_window = T.int64()
+        R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}})  # NOTE(review): the attrs name vocab_size / batch_size / seq_len / total_seq_len, none of which is a symbolic variable of this function - presumably module-wide attrs stamped onto every function; confirm.
+        cls = Module  # Alias through which the module's own functions are referenced below (cls.*).
+        shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))  # Allocates a 1-D int64 shape heap; the size argument is 5, the same as the number of shape parameters.
+        R.call_packed("vm.builtin.check_shape_info", max_batch_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,))  # This call and the next four run vm.builtin.check_shape_info on each parameter in declaration order; presumably R.prim_value(1) is the expected ndim - TODO confirm against the VM builtin.
+        R.call_packed("vm.builtin.check_shape_info", max_total_seq_len_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.check_shape_info", prefill_chunk_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.check_shape_info", page_size_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, 
annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.check_shape_info", support_sliding_window_, R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.match_shape", max_batch_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[0], param=max_batch_size_, annotation=R.Shape([max_batch_size])) "), sinfo_args=(R.Tuple,))  # This call and the next four run vm.builtin.match_shape on each parameter against shape_heap; they differ only in the parameter and in the last integer (0..4), presumably the heap slot that receives the dimension - TODO confirm.
+        R.call_packed("vm.builtin.match_shape", max_total_seq_len_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[1], param=max_total_seq_len_, annotation=R.Shape([max_total_seq_len])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.match_shape", prefill_chunk_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[2], param=prefill_chunk_size_, annotation=R.Shape([prefill_chunk_size])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.match_shape", page_size_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[3], param=page_size_, annotation=R.Shape([page_size])) "), sinfo_args=(R.Tuple,))
+        R.call_packed("vm.builtin.match_shape", support_sliding_window_, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), R.str("ErrorContext(fn=create_tir_paged_kv_cache, loc=param[4], param=support_sliding_window_, annotation=R.Shape([support_sliding_window])) "), sinfo_args=(R.Tuple,))
+        gv2559: R.Shape(ndim=5) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(5), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), R.prim_value(2), R.prim_value(1), R.prim_value(3), R.prim_value(1), R.prim_value(4), 
sinfo_args=(R.Shape(ndim=5),))  # Builds the 5-D R.Shape gv2559 from shape_heap; the (1, k) integer pairs for k = 0..4 presumably select heap slots 0..4 in order - TODO confirm.
+        paged_kv_cache: R.Object = R.call_packed("vm.builtin.paged_attention_kv_cache_create_reduced", gv2559, R.prim_value(32), R.prim_value(20), R.prim_value(20), R.prim_value(64), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.const(0, "float16"), cls.tir_kv_cache_transpose_append, cls.batch_prefill_paged_kv, cls.batch_decode_paged_kv, cls.batch_prefill_paged_kv_sliding_window, cls.batch_decode_paged_kv_sliding_window, cls.batch_prefill_ragged_kv, cls.merge_state_inplace, cls.fused_rope, cls.copy_single_page, cls.tir_kv_cache_debug_get_kv, cls.compact_kv_copy, cls.batch_tree_attn, sinfo_args=(R.Object,))  # Calls the runtime builtin with gv2559, seven integer settings, a float16 zero constant and twelve functions referenced through cls. NOTE(review): 32 / 20 / 20 / 64 presumably are layer count / query heads / KV heads / head dim (batch_prefill in this file passes layer ids 30 and 31 to the attention builtins and uses (seq_len, 20, 64) tensors); the meaning of the 0 / 1 / 1 flags is not visible here - confirm against the builtin's signature.
+        return paged_kv_cache
+
+    @R.function
+    def decode(input_ids: R.Tensor((1, 1), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 
5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), 
R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), 
dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 
1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): + R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(1),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=decode, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 
5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 
1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), 
R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.str("ErrorContext(fn=decode, loc=param[0], param=input_ids, annotation=R.Tensor((1, 1), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + model_decoder_embed_tokens_weight5: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + reshape1353: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, R.shape([1]), sinfo_args=(R.Tensor((1,), dtype="int32"),)) + model_decoder_embed_tokens_weight5_1: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + storage19: R.Object = R.vm.alloc_storage(R.shape([10240]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1167: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) + cls.take3(model_decoder_embed_tokens_weight5_1, reshape1353, alloc1167) + R.vm.kill_object(reshape1353) + R.vm.kill_object(model_decoder_embed_tokens_weight5_1) + lv264: R.Tensor((1,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((1,), dtype="int32"),)) + model_decoder_embed_positions_weight5: R.Tensor((448, 1280), dtype="float16") = packed_params[488] + storage20: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1168: R.Tensor((1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1280]), R.dtype("float16")) + cls.take4(model_decoder_embed_positions_weight5, lv264, alloc1168) + R.vm.kill_object(lv264) + R.vm.kill_object(model_decoder_embed_positions_weight5) + 
storage21: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1169: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_reshape20_reshape20_add6(alloc1167, alloc1168, alloc1169) + R.vm.kill_object(alloc1167) + R.vm.kill_object(alloc1168) + model_decoder_layers_0_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[496] + model_decoder_layers_0_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[497] + alloc1170: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1169, model_decoder_layers_0_self_attn_layer_norm_weight5, model_decoder_layers_0_self_attn_layer_norm_bias5, alloc1170) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias5) + model_decoder_layers_0_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] + model_decoder_layers_0_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[493] + alloc1171: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_q_proj_weight5, model_decoder_layers_0_self_attn_q_proj_bias5, alloc1171) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias5) + model_decoder_layers_0_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] + storage22: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1172: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), 
R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1170, model_decoder_layers_0_self_attn_k_proj_weight5, alloc1172) + R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight5) + model_decoder_layers_0_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] + model_decoder_layers_0_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[491] + storage23: R.Object = R.vm.alloc_storage(R.shape([7680]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1173: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1170, model_decoder_layers_0_self_attn_v_proj_weight5, model_decoder_layers_0_self_attn_v_proj_bias5, alloc1173) + R.vm.kill_object(alloc1170) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias5) + alloc1174: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1171, alloc1172, alloc1173, alloc1174) + R.vm.kill_object(alloc1171) + R.vm.kill_object(alloc1172) + R.vm.kill_object(alloc1173) + alloc1175: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1173: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), alloc1174, alloc1175) + R.vm.kill_object(alloc1174) + lv44: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1175, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1175) + model_decoder_layers_0_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] + 
model_decoder_layers_0_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[495] + alloc1176: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv44, model_decoder_layers_0_self_attn_out_proj_weight5, model_decoder_layers_0_self_attn_out_proj_bias5, alloc1169, alloc1176) + R.vm.kill_object(alloc1169) + R.vm.kill_object(lv44) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias5) + model_decoder_layers_0_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[505] + model_decoder_layers_0_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[506] + alloc1177: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1176, model_decoder_layers_0_encoder_attn_layer_norm_weight5, model_decoder_layers_0_encoder_attn_layer_norm_bias5, alloc1177) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias5) + model_decoder_layers_0_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] + model_decoder_layers_0_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[502] + alloc1178: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1177, model_decoder_layers_0_encoder_attn_q_proj_weight5, model_decoder_layers_0_encoder_attn_q_proj_bias5, alloc1178) + R.vm.kill_object(alloc1177) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias5) + lv47: R.Tensor((1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1178, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1178) + alloc1179: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1177: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), lv47, alloc1179) + R.vm.kill_object(lv47) + lv48: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1179, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1179) + model_decoder_layers_0_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] + model_decoder_layers_0_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[504] + alloc1180: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv48, model_decoder_layers_0_encoder_attn_out_proj_weight5, model_decoder_layers_0_encoder_attn_out_proj_bias5, alloc1176, alloc1180) + R.vm.kill_object(alloc1176) + R.vm.kill_object(lv48) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias5) + model_decoder_layers_0_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[511] + model_decoder_layers_0_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[512] + alloc1181: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1180, model_decoder_layers_0_final_layer_norm_weight5, model_decoder_layers_0_final_layer_norm_bias5, alloc1181) + 
R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias5) + model_decoder_layers_0_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] + model_decoder_layers_0_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[508] + alloc1182: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1181, model_decoder_layers_0_fc1_weight5, model_decoder_layers_0_fc1_bias5, alloc1182) + R.vm.kill_object(alloc1181) + R.vm.kill_object(model_decoder_layers_0_fc1_weight5) + R.vm.kill_object(model_decoder_layers_0_fc1_bias5) + model_decoder_layers_0_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] + model_decoder_layers_0_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[510] + alloc1183: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1182, model_decoder_layers_0_fc2_weight5, model_decoder_layers_0_fc2_bias5, alloc1180, alloc1183) + R.vm.kill_object(alloc1180) + R.vm.kill_object(alloc1182) + R.vm.kill_object(model_decoder_layers_0_fc2_weight5) + R.vm.kill_object(model_decoder_layers_0_fc2_bias5) + model_decoder_layers_1_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[520] + model_decoder_layers_1_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[521] + alloc1184: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1183, model_decoder_layers_1_self_attn_layer_norm_weight5, model_decoder_layers_1_self_attn_layer_norm_bias5, alloc1184) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight5) + 
R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias5) + model_decoder_layers_1_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] + model_decoder_layers_1_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[517] + alloc1185: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_q_proj_weight5, model_decoder_layers_1_self_attn_q_proj_bias5, alloc1185) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias5) + model_decoder_layers_1_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] + alloc1186: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1184, model_decoder_layers_1_self_attn_k_proj_weight5, alloc1186) + R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight5) + model_decoder_layers_1_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] + model_decoder_layers_1_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[515] + alloc1187: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1184, model_decoder_layers_1_self_attn_v_proj_weight5, model_decoder_layers_1_self_attn_v_proj_bias5, alloc1187) + R.vm.kill_object(alloc1184) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias5) + alloc1188: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1185, alloc1186, alloc1187, alloc1188) + R.vm.kill_object(alloc1185) + R.vm.kill_object(alloc1186) + R.vm.kill_object(alloc1187) + alloc1189: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1187: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), alloc1188, alloc1189) + R.vm.kill_object(alloc1188) + lv55: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1189, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1189) + model_decoder_layers_1_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] + model_decoder_layers_1_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[519] + alloc1190: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv55, model_decoder_layers_1_self_attn_out_proj_weight5, model_decoder_layers_1_self_attn_out_proj_bias5, alloc1183, alloc1190) + R.vm.kill_object(alloc1183) + R.vm.kill_object(lv55) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias5) + model_decoder_layers_1_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[529] + model_decoder_layers_1_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[530] + alloc1191: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1190, model_decoder_layers_1_encoder_attn_layer_norm_weight5, model_decoder_layers_1_encoder_attn_layer_norm_bias5, alloc1191) + 
R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias5) + model_decoder_layers_1_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] + model_decoder_layers_1_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[526] + alloc1192: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1191, model_decoder_layers_1_encoder_attn_q_proj_weight5, model_decoder_layers_1_encoder_attn_q_proj_bias5, alloc1192) + R.vm.kill_object(alloc1191) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias5) + lv58: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1192, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1192) + alloc1193: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1191: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), lv58, alloc1193) + R.vm.kill_object(lv58) + lv59: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1193, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1193) + model_decoder_layers_1_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] + model_decoder_layers_1_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[528] + alloc1194: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv59, 
model_decoder_layers_1_encoder_attn_out_proj_weight5, model_decoder_layers_1_encoder_attn_out_proj_bias5, alloc1190, alloc1194) + R.vm.kill_object(alloc1190) + R.vm.kill_object(lv59) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias5) + model_decoder_layers_1_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[535] + model_decoder_layers_1_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[536] + alloc1195: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1194, model_decoder_layers_1_final_layer_norm_weight5, model_decoder_layers_1_final_layer_norm_bias5, alloc1195) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias5) + model_decoder_layers_1_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] + model_decoder_layers_1_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[532] + alloc1196: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1195, model_decoder_layers_1_fc1_weight5, model_decoder_layers_1_fc1_bias5, alloc1196) + R.vm.kill_object(alloc1195) + R.vm.kill_object(model_decoder_layers_1_fc1_weight5) + R.vm.kill_object(model_decoder_layers_1_fc1_bias5) + model_decoder_layers_1_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] + model_decoder_layers_1_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[534] + alloc1197: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1196, model_decoder_layers_1_fc2_weight5, 
model_decoder_layers_1_fc2_bias5, alloc1194, alloc1197) + R.vm.kill_object(alloc1194) + R.vm.kill_object(alloc1196) + R.vm.kill_object(model_decoder_layers_1_fc2_weight5) + R.vm.kill_object(model_decoder_layers_1_fc2_bias5) + model_decoder_layers_2_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[544] + model_decoder_layers_2_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[545] + alloc1198: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1197, model_decoder_layers_2_self_attn_layer_norm_weight5, model_decoder_layers_2_self_attn_layer_norm_bias5, alloc1198) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias5) + model_decoder_layers_2_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] + model_decoder_layers_2_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[541] + alloc1199: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_q_proj_weight5, model_decoder_layers_2_self_attn_q_proj_bias5, alloc1199) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias5) + model_decoder_layers_2_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] + alloc1200: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1198, model_decoder_layers_2_self_attn_k_proj_weight5, alloc1200) + R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight5) + model_decoder_layers_2_self_attn_v_proj_weight5: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[538] + model_decoder_layers_2_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[539] + alloc1201: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1198, model_decoder_layers_2_self_attn_v_proj_weight5, model_decoder_layers_2_self_attn_v_proj_bias5, alloc1201) + R.vm.kill_object(alloc1198) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias5) + alloc1202: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1199, alloc1200, alloc1201, alloc1202) + R.vm.kill_object(alloc1199) + R.vm.kill_object(alloc1200) + R.vm.kill_object(alloc1201) + alloc1203: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1201: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), alloc1202, alloc1203) + R.vm.kill_object(alloc1202) + lv66: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1203, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1203) + model_decoder_layers_2_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] + model_decoder_layers_2_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[543] + alloc1204: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv66, model_decoder_layers_2_self_attn_out_proj_weight5, model_decoder_layers_2_self_attn_out_proj_bias5, 
alloc1197, alloc1204) + R.vm.kill_object(alloc1197) + R.vm.kill_object(lv66) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias5) + model_decoder_layers_2_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[553] + model_decoder_layers_2_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[554] + alloc1205: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1204, model_decoder_layers_2_encoder_attn_layer_norm_weight5, model_decoder_layers_2_encoder_attn_layer_norm_bias5, alloc1205) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias5) + model_decoder_layers_2_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] + model_decoder_layers_2_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[550] + alloc1206: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1205, model_decoder_layers_2_encoder_attn_q_proj_weight5, model_decoder_layers_2_encoder_attn_q_proj_bias5, alloc1206) + R.vm.kill_object(alloc1205) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias5) + lv69: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1206, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1206) + alloc1207: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1205: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", 
paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), lv69, alloc1207) + R.vm.kill_object(lv69) + lv70: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1207, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1207) + model_decoder_layers_2_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] + model_decoder_layers_2_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[552] + alloc1208: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv70, model_decoder_layers_2_encoder_attn_out_proj_weight5, model_decoder_layers_2_encoder_attn_out_proj_bias5, alloc1204, alloc1208) + R.vm.kill_object(alloc1204) + R.vm.kill_object(lv70) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias5) + model_decoder_layers_2_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[559] + model_decoder_layers_2_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[560] + alloc1209: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1208, model_decoder_layers_2_final_layer_norm_weight5, model_decoder_layers_2_final_layer_norm_bias5, alloc1209) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias5) + model_decoder_layers_2_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] + model_decoder_layers_2_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[556] + alloc1210: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), 
R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1209, model_decoder_layers_2_fc1_weight5, model_decoder_layers_2_fc1_bias5, alloc1210) + R.vm.kill_object(alloc1209) + R.vm.kill_object(model_decoder_layers_2_fc1_weight5) + R.vm.kill_object(model_decoder_layers_2_fc1_bias5) + model_decoder_layers_2_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] + model_decoder_layers_2_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[558] + alloc1211: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1210, model_decoder_layers_2_fc2_weight5, model_decoder_layers_2_fc2_bias5, alloc1208, alloc1211) + R.vm.kill_object(alloc1208) + R.vm.kill_object(alloc1210) + R.vm.kill_object(model_decoder_layers_2_fc2_weight5) + R.vm.kill_object(model_decoder_layers_2_fc2_bias5) + model_decoder_layers_3_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[568] + model_decoder_layers_3_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[569] + alloc1212: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1211, model_decoder_layers_3_self_attn_layer_norm_weight5, model_decoder_layers_3_self_attn_layer_norm_bias5, alloc1212) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias5) + model_decoder_layers_3_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] + model_decoder_layers_3_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[565] + alloc1213: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1212, 
model_decoder_layers_3_self_attn_q_proj_weight5, model_decoder_layers_3_self_attn_q_proj_bias5, alloc1213) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias5) + model_decoder_layers_3_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] + alloc1214: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1212, model_decoder_layers_3_self_attn_k_proj_weight5, alloc1214) + R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight5) + model_decoder_layers_3_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] + model_decoder_layers_3_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[563] + alloc1215: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1212, model_decoder_layers_3_self_attn_v_proj_weight5, model_decoder_layers_3_self_attn_v_proj_bias5, alloc1215) + R.vm.kill_object(alloc1212) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias5) + alloc1216: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1213, alloc1214, alloc1215, alloc1216) + R.vm.kill_object(alloc1213) + R.vm.kill_object(alloc1214) + R.vm.kill_object(alloc1215) + alloc1217: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), alloc1216, alloc1217) + 
R.vm.kill_object(alloc1216) + lv77: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1217, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1217) + model_decoder_layers_3_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] + model_decoder_layers_3_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[567] + alloc1218: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv77, model_decoder_layers_3_self_attn_out_proj_weight5, model_decoder_layers_3_self_attn_out_proj_bias5, alloc1211, alloc1218) + R.vm.kill_object(alloc1211) + R.vm.kill_object(lv77) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias5) + model_decoder_layers_3_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[577] + model_decoder_layers_3_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[578] + alloc1219: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1218, model_decoder_layers_3_encoder_attn_layer_norm_weight5, model_decoder_layers_3_encoder_attn_layer_norm_bias5, alloc1219) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias5) + model_decoder_layers_3_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] + model_decoder_layers_3_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[574] + alloc1220: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.fused_NT_matmul_add7(alloc1219, model_decoder_layers_3_encoder_attn_q_proj_weight5, model_decoder_layers_3_encoder_attn_q_proj_bias5, alloc1220) + R.vm.kill_object(alloc1219) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias5) + lv80: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1220, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1220) + alloc1221: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1219: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), lv80, alloc1221) + R.vm.kill_object(lv80) + lv81: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1221, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1221) + model_decoder_layers_3_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] + model_decoder_layers_3_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[576] + alloc1222: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv81, model_decoder_layers_3_encoder_attn_out_proj_weight5, model_decoder_layers_3_encoder_attn_out_proj_bias5, alloc1218, alloc1222) + R.vm.kill_object(alloc1218) + R.vm.kill_object(lv81) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias5) + model_decoder_layers_3_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[583] + model_decoder_layers_3_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[584] + alloc1223: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1222, model_decoder_layers_3_final_layer_norm_weight5, model_decoder_layers_3_final_layer_norm_bias5, alloc1223) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias5) + model_decoder_layers_3_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] + model_decoder_layers_3_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[580] + alloc1224: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1223, model_decoder_layers_3_fc1_weight5, model_decoder_layers_3_fc1_bias5, alloc1224) + R.vm.kill_object(alloc1223) + R.vm.kill_object(model_decoder_layers_3_fc1_weight5) + R.vm.kill_object(model_decoder_layers_3_fc1_bias5) + model_decoder_layers_3_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] + model_decoder_layers_3_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[582] + alloc1225: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1224, model_decoder_layers_3_fc2_weight5, model_decoder_layers_3_fc2_bias5, alloc1222, alloc1225) + R.vm.kill_object(alloc1222) + R.vm.kill_object(alloc1224) + R.vm.kill_object(model_decoder_layers_3_fc2_weight5) + R.vm.kill_object(model_decoder_layers_3_fc2_bias5) + model_decoder_layers_4_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[592] + model_decoder_layers_4_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[593] + alloc1226: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), 
R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1225, model_decoder_layers_4_self_attn_layer_norm_weight5, model_decoder_layers_4_self_attn_layer_norm_bias5, alloc1226) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias5) + model_decoder_layers_4_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] + model_decoder_layers_4_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[589] + alloc1227: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_q_proj_weight5, model_decoder_layers_4_self_attn_q_proj_bias5, alloc1227) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias5) + model_decoder_layers_4_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[585] + alloc1228: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1226, model_decoder_layers_4_self_attn_k_proj_weight5, alloc1228) + R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight5) + model_decoder_layers_4_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] + model_decoder_layers_4_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[587] + alloc1229: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1226, model_decoder_layers_4_self_attn_v_proj_weight5, model_decoder_layers_4_self_attn_v_proj_bias5, alloc1229) + R.vm.kill_object(alloc1226) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias5) + alloc1230: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1227, alloc1228, alloc1229, alloc1230) + R.vm.kill_object(alloc1227) + R.vm.kill_object(alloc1228) + R.vm.kill_object(alloc1229) + alloc1231: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1229: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), alloc1230, alloc1231) + R.vm.kill_object(alloc1230) + lv88: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1231, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1231) + model_decoder_layers_4_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] + model_decoder_layers_4_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[591] + alloc1232: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv88, model_decoder_layers_4_self_attn_out_proj_weight5, model_decoder_layers_4_self_attn_out_proj_bias5, alloc1225, alloc1232) + R.vm.kill_object(alloc1225) + R.vm.kill_object(lv88) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias5) + model_decoder_layers_4_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[601] + model_decoder_layers_4_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[602] + alloc1233: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, 
R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1232, model_decoder_layers_4_encoder_attn_layer_norm_weight5, model_decoder_layers_4_encoder_attn_layer_norm_bias5, alloc1233) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias5) + model_decoder_layers_4_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] + model_decoder_layers_4_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[598] + alloc1234: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1233, model_decoder_layers_4_encoder_attn_q_proj_weight5, model_decoder_layers_4_encoder_attn_q_proj_bias5, alloc1234) + R.vm.kill_object(alloc1233) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias5) + lv91: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1234, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1234) + alloc1235: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1233: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), lv91, alloc1235) + R.vm.kill_object(lv91) + lv92: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1235, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1235) + model_decoder_layers_4_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] + model_decoder_layers_4_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[600] + alloc1236: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv92, model_decoder_layers_4_encoder_attn_out_proj_weight5, model_decoder_layers_4_encoder_attn_out_proj_bias5, alloc1232, alloc1236) + R.vm.kill_object(alloc1232) + R.vm.kill_object(lv92) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias5) + model_decoder_layers_4_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[607] + model_decoder_layers_4_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[608] + alloc1237: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1236, model_decoder_layers_4_final_layer_norm_weight5, model_decoder_layers_4_final_layer_norm_bias5, alloc1237) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias5) + model_decoder_layers_4_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] + model_decoder_layers_4_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[604] + alloc1238: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1237, model_decoder_layers_4_fc1_weight5, model_decoder_layers_4_fc1_bias5, alloc1238) + R.vm.kill_object(alloc1237) + R.vm.kill_object(model_decoder_layers_4_fc1_weight5) + R.vm.kill_object(model_decoder_layers_4_fc1_bias5) + model_decoder_layers_4_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] + model_decoder_layers_4_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[606] + alloc1239: R.Tensor((1, 1, 1280), dtype="float16") = 
R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1238, model_decoder_layers_4_fc2_weight5, model_decoder_layers_4_fc2_bias5, alloc1236, alloc1239) + R.vm.kill_object(alloc1236) + R.vm.kill_object(alloc1238) + R.vm.kill_object(model_decoder_layers_4_fc2_weight5) + R.vm.kill_object(model_decoder_layers_4_fc2_bias5) + model_decoder_layers_5_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[616] + model_decoder_layers_5_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[617] + alloc1240: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1239, model_decoder_layers_5_self_attn_layer_norm_weight5, model_decoder_layers_5_self_attn_layer_norm_bias5, alloc1240) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias5) + model_decoder_layers_5_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] + model_decoder_layers_5_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[613] + alloc1241: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_q_proj_weight5, model_decoder_layers_5_self_attn_q_proj_bias5, alloc1241) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias5) + model_decoder_layers_5_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] + alloc1242: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1240, 
model_decoder_layers_5_self_attn_k_proj_weight5, alloc1242) + R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight5) + model_decoder_layers_5_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] + model_decoder_layers_5_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[611] + alloc1243: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1240, model_decoder_layers_5_self_attn_v_proj_weight5, model_decoder_layers_5_self_attn_v_proj_bias5, alloc1243) + R.vm.kill_object(alloc1240) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias5) + alloc1244: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1241, alloc1242, alloc1243, alloc1244) + R.vm.kill_object(alloc1241) + R.vm.kill_object(alloc1242) + R.vm.kill_object(alloc1243) + alloc1245: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1243: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), alloc1244, alloc1245) + R.vm.kill_object(alloc1244) + lv99: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1245, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1245) + model_decoder_layers_5_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] + model_decoder_layers_5_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[615] + alloc1246: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, 
R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv99, model_decoder_layers_5_self_attn_out_proj_weight5, model_decoder_layers_5_self_attn_out_proj_bias5, alloc1239, alloc1246) + R.vm.kill_object(alloc1239) + R.vm.kill_object(lv99) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias5) + model_decoder_layers_5_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[625] + model_decoder_layers_5_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[626] + alloc1247: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1246, model_decoder_layers_5_encoder_attn_layer_norm_weight5, model_decoder_layers_5_encoder_attn_layer_norm_bias5, alloc1247) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias5) + model_decoder_layers_5_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] + model_decoder_layers_5_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[622] + alloc1248: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1247, model_decoder_layers_5_encoder_attn_q_proj_weight5, model_decoder_layers_5_encoder_attn_q_proj_bias5, alloc1248) + R.vm.kill_object(alloc1247) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias5) + lv102: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1248, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1248) + alloc1249: R.Tensor((1, 
20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1247: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), lv102, alloc1249) + R.vm.kill_object(lv102) + lv103: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1249, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1249) + model_decoder_layers_5_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] + model_decoder_layers_5_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[624] + alloc1250: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv103, model_decoder_layers_5_encoder_attn_out_proj_weight5, model_decoder_layers_5_encoder_attn_out_proj_bias5, alloc1246, alloc1250) + R.vm.kill_object(alloc1246) + R.vm.kill_object(lv103) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias5) + model_decoder_layers_5_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[631] + model_decoder_layers_5_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[632] + alloc1251: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1250, model_decoder_layers_5_final_layer_norm_weight5, model_decoder_layers_5_final_layer_norm_bias5, alloc1251) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias5) + model_decoder_layers_5_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] + 
model_decoder_layers_5_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[628] + alloc1252: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1251, model_decoder_layers_5_fc1_weight5, model_decoder_layers_5_fc1_bias5, alloc1252) + R.vm.kill_object(alloc1251) + R.vm.kill_object(model_decoder_layers_5_fc1_weight5) + R.vm.kill_object(model_decoder_layers_5_fc1_bias5) + model_decoder_layers_5_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] + model_decoder_layers_5_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[630] + alloc1253: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1252, model_decoder_layers_5_fc2_weight5, model_decoder_layers_5_fc2_bias5, alloc1250, alloc1253) + R.vm.kill_object(alloc1250) + R.vm.kill_object(alloc1252) + R.vm.kill_object(model_decoder_layers_5_fc2_weight5) + R.vm.kill_object(model_decoder_layers_5_fc2_bias5) + model_decoder_layers_6_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[640] + model_decoder_layers_6_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[641] + alloc1254: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1253, model_decoder_layers_6_self_attn_layer_norm_weight5, model_decoder_layers_6_self_attn_layer_norm_bias5, alloc1254) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias5) + model_decoder_layers_6_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] + model_decoder_layers_6_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[637] + alloc1255: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_q_proj_weight5, model_decoder_layers_6_self_attn_q_proj_bias5, alloc1255) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias5) + model_decoder_layers_6_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] + alloc1256: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1254, model_decoder_layers_6_self_attn_k_proj_weight5, alloc1256) + R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight5) + model_decoder_layers_6_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] + model_decoder_layers_6_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[635] + alloc1257: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1254, model_decoder_layers_6_self_attn_v_proj_weight5, model_decoder_layers_6_self_attn_v_proj_bias5, alloc1257) + R.vm.kill_object(alloc1254) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias5) + alloc1258: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1255, alloc1256, alloc1257, alloc1258) + R.vm.kill_object(alloc1255) + R.vm.kill_object(alloc1256) + R.vm.kill_object(alloc1257) + alloc1259: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), 
R.dtype("float16")) + _1257: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), alloc1258, alloc1259) + R.vm.kill_object(alloc1258) + lv110: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1259, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1259) + model_decoder_layers_6_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] + model_decoder_layers_6_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[639] + alloc1260: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv110, model_decoder_layers_6_self_attn_out_proj_weight5, model_decoder_layers_6_self_attn_out_proj_bias5, alloc1253, alloc1260) + R.vm.kill_object(alloc1253) + R.vm.kill_object(lv110) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias5) + model_decoder_layers_6_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[649] + model_decoder_layers_6_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[650] + alloc1261: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1260, model_decoder_layers_6_encoder_attn_layer_norm_weight5, model_decoder_layers_6_encoder_attn_layer_norm_bias5, alloc1261) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias5) + model_decoder_layers_6_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] + model_decoder_layers_6_encoder_attn_q_proj_bias5: 
R.Tensor((1280,), dtype="float16") = packed_params[646] + alloc1262: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1261, model_decoder_layers_6_encoder_attn_q_proj_weight5, model_decoder_layers_6_encoder_attn_q_proj_bias5, alloc1262) + R.vm.kill_object(alloc1261) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias5) + lv113: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1262, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1262) + alloc1263: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), lv113, alloc1263) + R.vm.kill_object(lv113) + lv114: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1263, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1263) + model_decoder_layers_6_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] + model_decoder_layers_6_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[648] + alloc1264: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv114, model_decoder_layers_6_encoder_attn_out_proj_weight5, model_decoder_layers_6_encoder_attn_out_proj_bias5, alloc1260, alloc1264) + R.vm.kill_object(alloc1260) + R.vm.kill_object(lv114) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias5) + 
model_decoder_layers_6_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[655] + model_decoder_layers_6_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[656] + alloc1265: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1264, model_decoder_layers_6_final_layer_norm_weight5, model_decoder_layers_6_final_layer_norm_bias5, alloc1265) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias5) + model_decoder_layers_6_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] + model_decoder_layers_6_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[652] + alloc1266: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1265, model_decoder_layers_6_fc1_weight5, model_decoder_layers_6_fc1_bias5, alloc1266) + R.vm.kill_object(alloc1265) + R.vm.kill_object(model_decoder_layers_6_fc1_weight5) + R.vm.kill_object(model_decoder_layers_6_fc1_bias5) + model_decoder_layers_6_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] + model_decoder_layers_6_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[654] + alloc1267: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1266, model_decoder_layers_6_fc2_weight5, model_decoder_layers_6_fc2_bias5, alloc1264, alloc1267) + R.vm.kill_object(alloc1264) + R.vm.kill_object(alloc1266) + R.vm.kill_object(model_decoder_layers_6_fc2_weight5) + R.vm.kill_object(model_decoder_layers_6_fc2_bias5) + model_decoder_layers_7_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[664] + 
model_decoder_layers_7_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[665] + alloc1268: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1267, model_decoder_layers_7_self_attn_layer_norm_weight5, model_decoder_layers_7_self_attn_layer_norm_bias5, alloc1268) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias5) + model_decoder_layers_7_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] + model_decoder_layers_7_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[661] + alloc1269: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1268, model_decoder_layers_7_self_attn_q_proj_weight5, model_decoder_layers_7_self_attn_q_proj_bias5, alloc1269) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias5) + model_decoder_layers_7_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] + alloc1270: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1268, model_decoder_layers_7_self_attn_k_proj_weight5, alloc1270) + R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight5) + model_decoder_layers_7_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[658] + model_decoder_layers_7_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[659] + alloc1271: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1268, 
model_decoder_layers_7_self_attn_v_proj_weight5, model_decoder_layers_7_self_attn_v_proj_bias5, alloc1271) + R.vm.kill_object(alloc1268) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias5) + alloc1272: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1269, alloc1270, alloc1271, alloc1272) + R.vm.kill_object(alloc1269) + R.vm.kill_object(alloc1270) + R.vm.kill_object(alloc1271) + alloc1273: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1271: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), alloc1272, alloc1273) + R.vm.kill_object(alloc1272) + lv121: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1273, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1273) + model_decoder_layers_7_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] + model_decoder_layers_7_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[663] + alloc1274: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv121, model_decoder_layers_7_self_attn_out_proj_weight5, model_decoder_layers_7_self_attn_out_proj_bias5, alloc1267, alloc1274) + R.vm.kill_object(alloc1267) + R.vm.kill_object(lv121) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias5) + model_decoder_layers_7_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[673] 
+ model_decoder_layers_7_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[674] + alloc1275: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1274, model_decoder_layers_7_encoder_attn_layer_norm_weight5, model_decoder_layers_7_encoder_attn_layer_norm_bias5, alloc1275) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias5) + model_decoder_layers_7_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] + model_decoder_layers_7_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[670] + alloc1276: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1275, model_decoder_layers_7_encoder_attn_q_proj_weight5, model_decoder_layers_7_encoder_attn_q_proj_bias5, alloc1276) + R.vm.kill_object(alloc1275) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias5) + lv124: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1276, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1276) + alloc1277: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1275: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), lv124, alloc1277) + R.vm.kill_object(lv124) + lv125: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1277, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1277) + 
model_decoder_layers_7_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] + model_decoder_layers_7_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[672] + alloc1278: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv125, model_decoder_layers_7_encoder_attn_out_proj_weight5, model_decoder_layers_7_encoder_attn_out_proj_bias5, alloc1274, alloc1278) + R.vm.kill_object(alloc1274) + R.vm.kill_object(lv125) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias5) + model_decoder_layers_7_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[679] + model_decoder_layers_7_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[680] + alloc1279: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1278, model_decoder_layers_7_final_layer_norm_weight5, model_decoder_layers_7_final_layer_norm_bias5, alloc1279) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias5) + model_decoder_layers_7_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] + model_decoder_layers_7_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[676] + alloc1280: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1279, model_decoder_layers_7_fc1_weight5, model_decoder_layers_7_fc1_bias5, alloc1280) + R.vm.kill_object(alloc1279) + R.vm.kill_object(model_decoder_layers_7_fc1_weight5) + R.vm.kill_object(model_decoder_layers_7_fc1_bias5) + model_decoder_layers_7_fc2_weight5: 
R.Tensor((1280, 5120), dtype="float16") = packed_params[677] + model_decoder_layers_7_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[678] + alloc1281: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1280, model_decoder_layers_7_fc2_weight5, model_decoder_layers_7_fc2_bias5, alloc1278, alloc1281) + R.vm.kill_object(alloc1278) + R.vm.kill_object(alloc1280) + R.vm.kill_object(model_decoder_layers_7_fc2_weight5) + R.vm.kill_object(model_decoder_layers_7_fc2_bias5) + model_decoder_layers_8_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[688] + model_decoder_layers_8_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[689] + alloc1282: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1281, model_decoder_layers_8_self_attn_layer_norm_weight5, model_decoder_layers_8_self_attn_layer_norm_bias5, alloc1282) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias5) + model_decoder_layers_8_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] + model_decoder_layers_8_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[685] + alloc1283: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_q_proj_weight5, model_decoder_layers_8_self_attn_q_proj_bias5, alloc1283) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias5) + model_decoder_layers_8_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[681] + alloc1284: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1282, model_decoder_layers_8_self_attn_k_proj_weight5, alloc1284) + R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight5) + model_decoder_layers_8_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] + model_decoder_layers_8_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[683] + alloc1285: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1282, model_decoder_layers_8_self_attn_v_proj_weight5, model_decoder_layers_8_self_attn_v_proj_bias5, alloc1285) + R.vm.kill_object(alloc1282) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias5) + alloc1286: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1283, alloc1284, alloc1285, alloc1286) + R.vm.kill_object(alloc1283) + R.vm.kill_object(alloc1284) + R.vm.kill_object(alloc1285) + alloc1287: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1285: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), alloc1286, alloc1287) + R.vm.kill_object(alloc1286) + lv132: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1287, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1287) + model_decoder_layers_8_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] + 
model_decoder_layers_8_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[687] + alloc1288: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv132, model_decoder_layers_8_self_attn_out_proj_weight5, model_decoder_layers_8_self_attn_out_proj_bias5, alloc1281, alloc1288) + R.vm.kill_object(alloc1281) + R.vm.kill_object(lv132) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias5) + model_decoder_layers_8_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[697] + model_decoder_layers_8_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[698] + alloc1289: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1288, model_decoder_layers_8_encoder_attn_layer_norm_weight5, model_decoder_layers_8_encoder_attn_layer_norm_bias5, alloc1289) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias5) + model_decoder_layers_8_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] + model_decoder_layers_8_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[694] + alloc1290: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1289, model_decoder_layers_8_encoder_attn_q_proj_weight5, model_decoder_layers_8_encoder_attn_q_proj_bias5, alloc1290) + R.vm.kill_object(alloc1289) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias5) + lv135: R.Tensor((1, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1290, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1290) + alloc1291: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1289: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), lv135, alloc1291) + R.vm.kill_object(lv135) + lv136: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1291, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1291) + model_decoder_layers_8_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] + model_decoder_layers_8_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[696] + alloc1292: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv136, model_decoder_layers_8_encoder_attn_out_proj_weight5, model_decoder_layers_8_encoder_attn_out_proj_bias5, alloc1288, alloc1292) + R.vm.kill_object(alloc1288) + R.vm.kill_object(lv136) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias5) + model_decoder_layers_8_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[703] + model_decoder_layers_8_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[704] + alloc1293: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1292, model_decoder_layers_8_final_layer_norm_weight5, model_decoder_layers_8_final_layer_norm_bias5, alloc1293) + 
R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias5) + model_decoder_layers_8_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] + model_decoder_layers_8_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[700] + alloc1294: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1293, model_decoder_layers_8_fc1_weight5, model_decoder_layers_8_fc1_bias5, alloc1294) + R.vm.kill_object(alloc1293) + R.vm.kill_object(model_decoder_layers_8_fc1_weight5) + R.vm.kill_object(model_decoder_layers_8_fc1_bias5) + model_decoder_layers_8_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] + model_decoder_layers_8_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[702] + alloc1295: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1294, model_decoder_layers_8_fc2_weight5, model_decoder_layers_8_fc2_bias5, alloc1292, alloc1295) + R.vm.kill_object(alloc1292) + R.vm.kill_object(alloc1294) + R.vm.kill_object(model_decoder_layers_8_fc2_weight5) + R.vm.kill_object(model_decoder_layers_8_fc2_bias5) + model_decoder_layers_9_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[712] + model_decoder_layers_9_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[713] + alloc1296: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1295, model_decoder_layers_9_self_attn_layer_norm_weight5, model_decoder_layers_9_self_attn_layer_norm_bias5, alloc1296) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight5) + 
R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias5) + model_decoder_layers_9_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] + model_decoder_layers_9_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[709] + alloc1297: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_q_proj_weight5, model_decoder_layers_9_self_attn_q_proj_bias5, alloc1297) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias5) + model_decoder_layers_9_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[705] + alloc1298: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1296, model_decoder_layers_9_self_attn_k_proj_weight5, alloc1298) + R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight5) + model_decoder_layers_9_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] + model_decoder_layers_9_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[707] + alloc1299: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1296, model_decoder_layers_9_self_attn_v_proj_weight5, model_decoder_layers_9_self_attn_v_proj_bias5, alloc1299) + R.vm.kill_object(alloc1296) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias5) + alloc1300: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1297, alloc1298, alloc1299, alloc1300) + R.vm.kill_object(alloc1297) + R.vm.kill_object(alloc1298) + R.vm.kill_object(alloc1299) + alloc1301: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1299: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), alloc1300, alloc1301) + R.vm.kill_object(alloc1300) + lv143: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1301, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1301) + model_decoder_layers_9_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] + model_decoder_layers_9_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[711] + alloc1302: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv143, model_decoder_layers_9_self_attn_out_proj_weight5, model_decoder_layers_9_self_attn_out_proj_bias5, alloc1295, alloc1302) + R.vm.kill_object(alloc1295) + R.vm.kill_object(lv143) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias5) + model_decoder_layers_9_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[721] + model_decoder_layers_9_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[722] + alloc1303: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1302, model_decoder_layers_9_encoder_attn_layer_norm_weight5, model_decoder_layers_9_encoder_attn_layer_norm_bias5, alloc1303) + 
R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias5) + model_decoder_layers_9_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] + model_decoder_layers_9_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[718] + alloc1304: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1303, model_decoder_layers_9_encoder_attn_q_proj_weight5, model_decoder_layers_9_encoder_attn_q_proj_bias5, alloc1304) + R.vm.kill_object(alloc1303) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias5) + lv146: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1304, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1304) + alloc1305: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1303: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), lv146, alloc1305) + R.vm.kill_object(lv146) + lv147: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1305, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1305) + model_decoder_layers_9_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] + model_decoder_layers_9_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[720] + alloc1306: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv147, 
model_decoder_layers_9_encoder_attn_out_proj_weight5, model_decoder_layers_9_encoder_attn_out_proj_bias5, alloc1302, alloc1306) + R.vm.kill_object(alloc1302) + R.vm.kill_object(lv147) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias5) + model_decoder_layers_9_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[727] + model_decoder_layers_9_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[728] + alloc1307: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1306, model_decoder_layers_9_final_layer_norm_weight5, model_decoder_layers_9_final_layer_norm_bias5, alloc1307) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias5) + model_decoder_layers_9_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] + model_decoder_layers_9_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[724] + alloc1308: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1307, model_decoder_layers_9_fc1_weight5, model_decoder_layers_9_fc1_bias5, alloc1308) + R.vm.kill_object(alloc1307) + R.vm.kill_object(model_decoder_layers_9_fc1_weight5) + R.vm.kill_object(model_decoder_layers_9_fc1_bias5) + model_decoder_layers_9_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] + model_decoder_layers_9_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[726] + alloc1309: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1308, model_decoder_layers_9_fc2_weight5, 
model_decoder_layers_9_fc2_bias5, alloc1306, alloc1309) + R.vm.kill_object(alloc1306) + R.vm.kill_object(alloc1308) + R.vm.kill_object(model_decoder_layers_9_fc2_weight5) + R.vm.kill_object(model_decoder_layers_9_fc2_bias5) + model_decoder_layers_10_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[736] + model_decoder_layers_10_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[737] + alloc1310: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1309, model_decoder_layers_10_self_attn_layer_norm_weight5, model_decoder_layers_10_self_attn_layer_norm_bias5, alloc1310) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias5) + model_decoder_layers_10_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] + model_decoder_layers_10_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[733] + alloc1311: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_q_proj_weight5, model_decoder_layers_10_self_attn_q_proj_bias5, alloc1311) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias5) + model_decoder_layers_10_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] + alloc1312: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1310, model_decoder_layers_10_self_attn_k_proj_weight5, alloc1312) + R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight5) + model_decoder_layers_10_self_attn_v_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[730] + model_decoder_layers_10_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[731] + alloc1313: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1310, model_decoder_layers_10_self_attn_v_proj_weight5, model_decoder_layers_10_self_attn_v_proj_bias5, alloc1313) + R.vm.kill_object(alloc1310) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias5) + alloc1314: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1311, alloc1312, alloc1313, alloc1314) + R.vm.kill_object(alloc1311) + R.vm.kill_object(alloc1312) + R.vm.kill_object(alloc1313) + alloc1315: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1313: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), alloc1314, alloc1315) + R.vm.kill_object(alloc1314) + lv154: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1315, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1315) + model_decoder_layers_10_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] + model_decoder_layers_10_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[735] + alloc1316: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv154, model_decoder_layers_10_self_attn_out_proj_weight5, 
model_decoder_layers_10_self_attn_out_proj_bias5, alloc1309, alloc1316) + R.vm.kill_object(alloc1309) + R.vm.kill_object(lv154) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias5) + model_decoder_layers_10_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[745] + model_decoder_layers_10_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[746] + alloc1317: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1316, model_decoder_layers_10_encoder_attn_layer_norm_weight5, model_decoder_layers_10_encoder_attn_layer_norm_bias5, alloc1317) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias5) + model_decoder_layers_10_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] + model_decoder_layers_10_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[742] + alloc1318: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1317, model_decoder_layers_10_encoder_attn_q_proj_weight5, model_decoder_layers_10_encoder_attn_q_proj_bias5, alloc1318) + R.vm.kill_object(alloc1317) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias5) + lv157: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1318, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1318) + alloc1319: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1317: R.Object 
= R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), lv157, alloc1319) + R.vm.kill_object(lv157) + lv158: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1319, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1319) + model_decoder_layers_10_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] + model_decoder_layers_10_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[744] + alloc1320: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv158, model_decoder_layers_10_encoder_attn_out_proj_weight5, model_decoder_layers_10_encoder_attn_out_proj_bias5, alloc1316, alloc1320) + R.vm.kill_object(alloc1316) + R.vm.kill_object(lv158) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias5) + model_decoder_layers_10_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[751] + model_decoder_layers_10_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[752] + alloc1321: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1320, model_decoder_layers_10_final_layer_norm_weight5, model_decoder_layers_10_final_layer_norm_bias5, alloc1321) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias5) + model_decoder_layers_10_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] + model_decoder_layers_10_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[748] + alloc1322: R.Tensor((1, 1, 5120), dtype="float16") 
= R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1321, model_decoder_layers_10_fc1_weight5, model_decoder_layers_10_fc1_bias5, alloc1322) + R.vm.kill_object(alloc1321) + R.vm.kill_object(model_decoder_layers_10_fc1_weight5) + R.vm.kill_object(model_decoder_layers_10_fc1_bias5) + model_decoder_layers_10_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] + model_decoder_layers_10_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[750] + alloc1323: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1322, model_decoder_layers_10_fc2_weight5, model_decoder_layers_10_fc2_bias5, alloc1320, alloc1323) + R.vm.kill_object(alloc1320) + R.vm.kill_object(alloc1322) + R.vm.kill_object(model_decoder_layers_10_fc2_weight5) + R.vm.kill_object(model_decoder_layers_10_fc2_bias5) + model_decoder_layers_11_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[760] + model_decoder_layers_11_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[761] + alloc1324: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1323, model_decoder_layers_11_self_attn_layer_norm_weight5, model_decoder_layers_11_self_attn_layer_norm_bias5, alloc1324) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias5) + model_decoder_layers_11_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] + model_decoder_layers_11_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[757] + alloc1325: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 
1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_q_proj_weight5, model_decoder_layers_11_self_attn_q_proj_bias5, alloc1325) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias5) + model_decoder_layers_11_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] + alloc1326: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1324, model_decoder_layers_11_self_attn_k_proj_weight5, alloc1326) + R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight5) + model_decoder_layers_11_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] + model_decoder_layers_11_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[755] + alloc1327: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1324, model_decoder_layers_11_self_attn_v_proj_weight5, model_decoder_layers_11_self_attn_v_proj_bias5, alloc1327) + R.vm.kill_object(alloc1324) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias5) + alloc1328: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1325, alloc1326, alloc1327, alloc1328) + R.vm.kill_object(alloc1325) + R.vm.kill_object(alloc1326) + R.vm.kill_object(alloc1327) + alloc1329: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1327: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, 
R.prim_value(11), R.prim_value(T.float32(1)), alloc1328, alloc1329) + R.vm.kill_object(alloc1328) + lv165: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1329, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1329) + model_decoder_layers_11_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] + model_decoder_layers_11_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[759] + alloc1330: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv165, model_decoder_layers_11_self_attn_out_proj_weight5, model_decoder_layers_11_self_attn_out_proj_bias5, alloc1323, alloc1330) + R.vm.kill_object(alloc1323) + R.vm.kill_object(lv165) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias5) + model_decoder_layers_11_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[769] + model_decoder_layers_11_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[770] + alloc1331: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1330, model_decoder_layers_11_encoder_attn_layer_norm_weight5, model_decoder_layers_11_encoder_attn_layer_norm_bias5, alloc1331) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias5) + model_decoder_layers_11_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] + model_decoder_layers_11_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[766] + alloc1332: R.Tensor((1, 1, 1280), dtype="float16") = 
R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1331, model_decoder_layers_11_encoder_attn_q_proj_weight5, model_decoder_layers_11_encoder_attn_q_proj_bias5, alloc1332) + R.vm.kill_object(alloc1331) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias5) + lv168: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1332, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1332) + alloc1333: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1331: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), lv168, alloc1333) + R.vm.kill_object(lv168) + lv169: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1333, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1333) + model_decoder_layers_11_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] + model_decoder_layers_11_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[768] + alloc1334: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv169, model_decoder_layers_11_encoder_attn_out_proj_weight5, model_decoder_layers_11_encoder_attn_out_proj_bias5, alloc1330, alloc1334) + R.vm.kill_object(alloc1330) + R.vm.kill_object(lv169) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias5) + model_decoder_layers_11_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = 
packed_params[775] + model_decoder_layers_11_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[776] + alloc1335: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1334, model_decoder_layers_11_final_layer_norm_weight5, model_decoder_layers_11_final_layer_norm_bias5, alloc1335) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias5) + model_decoder_layers_11_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] + model_decoder_layers_11_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[772] + alloc1336: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1335, model_decoder_layers_11_fc1_weight5, model_decoder_layers_11_fc1_bias5, alloc1336) + R.vm.kill_object(alloc1335) + R.vm.kill_object(model_decoder_layers_11_fc1_weight5) + R.vm.kill_object(model_decoder_layers_11_fc1_bias5) + model_decoder_layers_11_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] + model_decoder_layers_11_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[774] + alloc1337: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1336, model_decoder_layers_11_fc2_weight5, model_decoder_layers_11_fc2_bias5, alloc1334, alloc1337) + R.vm.kill_object(alloc1334) + R.vm.kill_object(alloc1336) + R.vm.kill_object(model_decoder_layers_11_fc2_weight5) + R.vm.kill_object(model_decoder_layers_11_fc2_bias5) + model_decoder_layers_12_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[784] + model_decoder_layers_12_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[785] + alloc1338: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1337, model_decoder_layers_12_self_attn_layer_norm_weight5, model_decoder_layers_12_self_attn_layer_norm_bias5, alloc1338) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias5) + model_decoder_layers_12_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] + model_decoder_layers_12_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[781] + alloc1339: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_q_proj_weight5, model_decoder_layers_12_self_attn_q_proj_bias5, alloc1339) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias5) + model_decoder_layers_12_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] + alloc1340: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1338, model_decoder_layers_12_self_attn_k_proj_weight5, alloc1340) + R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight5) + model_decoder_layers_12_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] + model_decoder_layers_12_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[779] + alloc1341: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1338, model_decoder_layers_12_self_attn_v_proj_weight5, 
model_decoder_layers_12_self_attn_v_proj_bias5, alloc1341) + R.vm.kill_object(alloc1338) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias5) + alloc1342: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1339, alloc1340, alloc1341, alloc1342) + R.vm.kill_object(alloc1339) + R.vm.kill_object(alloc1340) + R.vm.kill_object(alloc1341) + alloc1343: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1341: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), alloc1342, alloc1343) + R.vm.kill_object(alloc1342) + lv176: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1343, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1343) + model_decoder_layers_12_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] + model_decoder_layers_12_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[783] + alloc1344: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv176, model_decoder_layers_12_self_attn_out_proj_weight5, model_decoder_layers_12_self_attn_out_proj_bias5, alloc1337, alloc1344) + R.vm.kill_object(alloc1337) + R.vm.kill_object(lv176) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias5) + model_decoder_layers_12_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[793] + 
model_decoder_layers_12_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[794] + alloc1345: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1344, model_decoder_layers_12_encoder_attn_layer_norm_weight5, model_decoder_layers_12_encoder_attn_layer_norm_bias5, alloc1345) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias5) + model_decoder_layers_12_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] + model_decoder_layers_12_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[790] + alloc1346: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1345, model_decoder_layers_12_encoder_attn_q_proj_weight5, model_decoder_layers_12_encoder_attn_q_proj_bias5, alloc1346) + R.vm.kill_object(alloc1345) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias5) + lv179: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1346, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1346) + alloc1347: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1345: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), lv179, alloc1347) + R.vm.kill_object(lv179) + lv180: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1347, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1347) + 
model_decoder_layers_12_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] + model_decoder_layers_12_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[792] + alloc1348: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv180, model_decoder_layers_12_encoder_attn_out_proj_weight5, model_decoder_layers_12_encoder_attn_out_proj_bias5, alloc1344, alloc1348) + R.vm.kill_object(alloc1344) + R.vm.kill_object(lv180) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias5) + model_decoder_layers_12_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[799] + model_decoder_layers_12_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[800] + alloc1349: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1348, model_decoder_layers_12_final_layer_norm_weight5, model_decoder_layers_12_final_layer_norm_bias5, alloc1349) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias5) + model_decoder_layers_12_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] + model_decoder_layers_12_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[796] + alloc1350: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1349, model_decoder_layers_12_fc1_weight5, model_decoder_layers_12_fc1_bias5, alloc1350) + R.vm.kill_object(alloc1349) + R.vm.kill_object(model_decoder_layers_12_fc1_weight5) + R.vm.kill_object(model_decoder_layers_12_fc1_bias5) + 
model_decoder_layers_12_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] + model_decoder_layers_12_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[798] + alloc1351: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1350, model_decoder_layers_12_fc2_weight5, model_decoder_layers_12_fc2_bias5, alloc1348, alloc1351) + R.vm.kill_object(alloc1348) + R.vm.kill_object(alloc1350) + R.vm.kill_object(model_decoder_layers_12_fc2_weight5) + R.vm.kill_object(model_decoder_layers_12_fc2_bias5) + model_decoder_layers_13_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[808] + model_decoder_layers_13_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[809] + alloc1352: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1351, model_decoder_layers_13_self_attn_layer_norm_weight5, model_decoder_layers_13_self_attn_layer_norm_bias5, alloc1352) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias5) + model_decoder_layers_13_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] + model_decoder_layers_13_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[805] + alloc1353: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_q_proj_weight5, model_decoder_layers_13_self_attn_q_proj_bias5, alloc1353) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias5) + model_decoder_layers_13_self_attn_k_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[801] + alloc1354: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1352, model_decoder_layers_13_self_attn_k_proj_weight5, alloc1354) + R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight5) + model_decoder_layers_13_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] + model_decoder_layers_13_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[803] + alloc1355: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1352, model_decoder_layers_13_self_attn_v_proj_weight5, model_decoder_layers_13_self_attn_v_proj_bias5, alloc1355) + R.vm.kill_object(alloc1352) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias5) + alloc1356: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1353, alloc1354, alloc1355, alloc1356) + R.vm.kill_object(alloc1353) + R.vm.kill_object(alloc1354) + R.vm.kill_object(alloc1355) + alloc1357: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1355: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), alloc1356, alloc1357) + R.vm.kill_object(alloc1356) + lv187: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1357, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1357) + model_decoder_layers_13_self_attn_out_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[806] + model_decoder_layers_13_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[807] + alloc1358: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv187, model_decoder_layers_13_self_attn_out_proj_weight5, model_decoder_layers_13_self_attn_out_proj_bias5, alloc1351, alloc1358) + R.vm.kill_object(alloc1351) + R.vm.kill_object(lv187) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias5) + model_decoder_layers_13_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[817] + model_decoder_layers_13_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[818] + alloc1359: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1358, model_decoder_layers_13_encoder_attn_layer_norm_weight5, model_decoder_layers_13_encoder_attn_layer_norm_bias5, alloc1359) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias5) + model_decoder_layers_13_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] + model_decoder_layers_13_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[814] + alloc1360: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1359, model_decoder_layers_13_encoder_attn_q_proj_weight5, model_decoder_layers_13_encoder_attn_q_proj_bias5, alloc1360) + R.vm.kill_object(alloc1359) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias5) + lv190: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1360, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1360) + alloc1361: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1359: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), lv190, alloc1361) + R.vm.kill_object(lv190) + lv191: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1361, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1361) + model_decoder_layers_13_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] + model_decoder_layers_13_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[816] + alloc1362: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv191, model_decoder_layers_13_encoder_attn_out_proj_weight5, model_decoder_layers_13_encoder_attn_out_proj_bias5, alloc1358, alloc1362) + R.vm.kill_object(alloc1358) + R.vm.kill_object(lv191) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias5) + model_decoder_layers_13_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[823] + model_decoder_layers_13_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[824] + alloc1363: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1362, model_decoder_layers_13_final_layer_norm_weight5, 
model_decoder_layers_13_final_layer_norm_bias5, alloc1363) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias5) + model_decoder_layers_13_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] + model_decoder_layers_13_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[820] + alloc1364: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1363, model_decoder_layers_13_fc1_weight5, model_decoder_layers_13_fc1_bias5, alloc1364) + R.vm.kill_object(alloc1363) + R.vm.kill_object(model_decoder_layers_13_fc1_weight5) + R.vm.kill_object(model_decoder_layers_13_fc1_bias5) + model_decoder_layers_13_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] + model_decoder_layers_13_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[822] + alloc1365: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1364, model_decoder_layers_13_fc2_weight5, model_decoder_layers_13_fc2_bias5, alloc1362, alloc1365) + R.vm.kill_object(alloc1362) + R.vm.kill_object(alloc1364) + R.vm.kill_object(model_decoder_layers_13_fc2_weight5) + R.vm.kill_object(model_decoder_layers_13_fc2_bias5) + model_decoder_layers_14_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[832] + model_decoder_layers_14_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[833] + alloc1366: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1365, model_decoder_layers_14_self_attn_layer_norm_weight5, model_decoder_layers_14_self_attn_layer_norm_bias5, alloc1366) + 
R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias5) + model_decoder_layers_14_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] + model_decoder_layers_14_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[829] + alloc1367: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_q_proj_weight5, model_decoder_layers_14_self_attn_q_proj_bias5, alloc1367) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias5) + model_decoder_layers_14_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] + alloc1368: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1366, model_decoder_layers_14_self_attn_k_proj_weight5, alloc1368) + R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight5) + model_decoder_layers_14_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] + model_decoder_layers_14_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[827] + alloc1369: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1366, model_decoder_layers_14_self_attn_v_proj_weight5, model_decoder_layers_14_self_attn_v_proj_bias5, alloc1369) + R.vm.kill_object(alloc1366) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias5) + alloc1370: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), 
R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1367, alloc1368, alloc1369, alloc1370) + R.vm.kill_object(alloc1367) + R.vm.kill_object(alloc1368) + R.vm.kill_object(alloc1369) + alloc1371: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1369: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), alloc1370, alloc1371) + R.vm.kill_object(alloc1370) + lv198: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1371, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1371) + model_decoder_layers_14_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] + model_decoder_layers_14_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[831] + alloc1372: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv198, model_decoder_layers_14_self_attn_out_proj_weight5, model_decoder_layers_14_self_attn_out_proj_bias5, alloc1365, alloc1372) + R.vm.kill_object(alloc1365) + R.vm.kill_object(lv198) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias5) + model_decoder_layers_14_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[841] + model_decoder_layers_14_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[842] + alloc1373: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1372, model_decoder_layers_14_encoder_attn_layer_norm_weight5, 
model_decoder_layers_14_encoder_attn_layer_norm_bias5, alloc1373) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias5) + model_decoder_layers_14_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] + model_decoder_layers_14_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[838] + alloc1374: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1373, model_decoder_layers_14_encoder_attn_q_proj_weight5, model_decoder_layers_14_encoder_attn_q_proj_bias5, alloc1374) + R.vm.kill_object(alloc1373) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias5) + lv201: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1374, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1374) + alloc1375: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1373: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), lv201, alloc1375) + R.vm.kill_object(lv201) + lv202: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1375, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1375) + model_decoder_layers_14_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] + model_decoder_layers_14_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[840] + alloc1376: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 
1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv202, model_decoder_layers_14_encoder_attn_out_proj_weight5, model_decoder_layers_14_encoder_attn_out_proj_bias5, alloc1372, alloc1376) + R.vm.kill_object(alloc1372) + R.vm.kill_object(lv202) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias5) + model_decoder_layers_14_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[847] + model_decoder_layers_14_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[848] + alloc1377: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1376, model_decoder_layers_14_final_layer_norm_weight5, model_decoder_layers_14_final_layer_norm_bias5, alloc1377) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias5) + model_decoder_layers_14_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] + model_decoder_layers_14_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[844] + alloc1378: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1377, model_decoder_layers_14_fc1_weight5, model_decoder_layers_14_fc1_bias5, alloc1378) + R.vm.kill_object(alloc1377) + R.vm.kill_object(model_decoder_layers_14_fc1_weight5) + R.vm.kill_object(model_decoder_layers_14_fc1_bias5) + model_decoder_layers_14_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] + model_decoder_layers_14_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[846] + alloc1379: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.fused_NT_matmul2_add7_add6(alloc1378, model_decoder_layers_14_fc2_weight5, model_decoder_layers_14_fc2_bias5, alloc1376, alloc1379) + R.vm.kill_object(alloc1376) + R.vm.kill_object(alloc1378) + R.vm.kill_object(model_decoder_layers_14_fc2_weight5) + R.vm.kill_object(model_decoder_layers_14_fc2_bias5) + model_decoder_layers_15_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[856] + model_decoder_layers_15_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[857] + alloc1380: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1379, model_decoder_layers_15_self_attn_layer_norm_weight5, model_decoder_layers_15_self_attn_layer_norm_bias5, alloc1380) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias5) + model_decoder_layers_15_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] + model_decoder_layers_15_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[853] + alloc1381: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_q_proj_weight5, model_decoder_layers_15_self_attn_q_proj_bias5, alloc1381) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias5) + model_decoder_layers_15_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] + alloc1382: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1380, model_decoder_layers_15_self_attn_k_proj_weight5, alloc1382) + 
R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight5) + model_decoder_layers_15_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] + model_decoder_layers_15_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[851] + alloc1383: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1380, model_decoder_layers_15_self_attn_v_proj_weight5, model_decoder_layers_15_self_attn_v_proj_bias5, alloc1383) + R.vm.kill_object(alloc1380) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias5) + alloc1384: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1381, alloc1382, alloc1383, alloc1384) + R.vm.kill_object(alloc1381) + R.vm.kill_object(alloc1382) + R.vm.kill_object(alloc1383) + alloc1385: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1383: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), alloc1384, alloc1385) + R.vm.kill_object(alloc1384) + lv209: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1385, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1385) + model_decoder_layers_15_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] + model_decoder_layers_15_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[855] + alloc1386: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv209, model_decoder_layers_15_self_attn_out_proj_weight5, model_decoder_layers_15_self_attn_out_proj_bias5, alloc1379, alloc1386) + R.vm.kill_object(alloc1379) + R.vm.kill_object(lv209) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias5) + model_decoder_layers_15_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[865] + model_decoder_layers_15_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[866] + alloc1387: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1386, model_decoder_layers_15_encoder_attn_layer_norm_weight5, model_decoder_layers_15_encoder_attn_layer_norm_bias5, alloc1387) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias5) + model_decoder_layers_15_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] + model_decoder_layers_15_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[862] + alloc1388: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1387, model_decoder_layers_15_encoder_attn_q_proj_weight5, model_decoder_layers_15_encoder_attn_q_proj_bias5, alloc1388) + R.vm.kill_object(alloc1387) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias5) + lv212: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1388, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1388) + alloc1389: R.Tensor((1, 20, 64), 
dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1387: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), lv212, alloc1389) + R.vm.kill_object(lv212) + lv213: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1389, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1389) + model_decoder_layers_15_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] + model_decoder_layers_15_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[864] + alloc1390: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv213, model_decoder_layers_15_encoder_attn_out_proj_weight5, model_decoder_layers_15_encoder_attn_out_proj_bias5, alloc1386, alloc1390) + R.vm.kill_object(alloc1386) + R.vm.kill_object(lv213) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias5) + model_decoder_layers_15_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[871] + model_decoder_layers_15_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[872] + alloc1391: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1390, model_decoder_layers_15_final_layer_norm_weight5, model_decoder_layers_15_final_layer_norm_bias5, alloc1391) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias5) + model_decoder_layers_15_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] + 
model_decoder_layers_15_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[868] + alloc1392: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1391, model_decoder_layers_15_fc1_weight5, model_decoder_layers_15_fc1_bias5, alloc1392) + R.vm.kill_object(alloc1391) + R.vm.kill_object(model_decoder_layers_15_fc1_weight5) + R.vm.kill_object(model_decoder_layers_15_fc1_bias5) + model_decoder_layers_15_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] + model_decoder_layers_15_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[870] + alloc1393: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1392, model_decoder_layers_15_fc2_weight5, model_decoder_layers_15_fc2_bias5, alloc1390, alloc1393) + R.vm.kill_object(alloc1390) + R.vm.kill_object(alloc1392) + R.vm.kill_object(model_decoder_layers_15_fc2_weight5) + R.vm.kill_object(model_decoder_layers_15_fc2_bias5) + model_decoder_layers_16_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[880] + model_decoder_layers_16_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[881] + alloc1394: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1393, model_decoder_layers_16_self_attn_layer_norm_weight5, model_decoder_layers_16_self_attn_layer_norm_bias5, alloc1394) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias5) + model_decoder_layers_16_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] + model_decoder_layers_16_self_attn_q_proj_bias5: R.Tensor((1280,), 
dtype="float16") = packed_params[877] + alloc1395: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_q_proj_weight5, model_decoder_layers_16_self_attn_q_proj_bias5, alloc1395) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias5) + model_decoder_layers_16_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] + alloc1396: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1394, model_decoder_layers_16_self_attn_k_proj_weight5, alloc1396) + R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight5) + model_decoder_layers_16_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] + model_decoder_layers_16_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[875] + alloc1397: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1394, model_decoder_layers_16_self_attn_v_proj_weight5, model_decoder_layers_16_self_attn_v_proj_bias5, alloc1397) + R.vm.kill_object(alloc1394) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias5) + alloc1398: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1395, alloc1396, alloc1397, alloc1398) + R.vm.kill_object(alloc1395) + R.vm.kill_object(alloc1396) + R.vm.kill_object(alloc1397) + alloc1399: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) + _1397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), alloc1398, alloc1399) + R.vm.kill_object(alloc1398) + lv220: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1399, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1399) + model_decoder_layers_16_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] + model_decoder_layers_16_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[879] + alloc1400: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv220, model_decoder_layers_16_self_attn_out_proj_weight5, model_decoder_layers_16_self_attn_out_proj_bias5, alloc1393, alloc1400) + R.vm.kill_object(alloc1393) + R.vm.kill_object(lv220) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias5) + model_decoder_layers_16_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[889] + model_decoder_layers_16_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[890] + alloc1401: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1400, model_decoder_layers_16_encoder_attn_layer_norm_weight5, model_decoder_layers_16_encoder_attn_layer_norm_bias5, alloc1401) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias5) + model_decoder_layers_16_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] + 
model_decoder_layers_16_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[886] + alloc1402: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1401, model_decoder_layers_16_encoder_attn_q_proj_weight5, model_decoder_layers_16_encoder_attn_q_proj_bias5, alloc1402) + R.vm.kill_object(alloc1401) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias5) + lv223: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1402, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1402) + alloc1403: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1401: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), lv223, alloc1403) + R.vm.kill_object(lv223) + lv224: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1403, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1403) + model_decoder_layers_16_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] + model_decoder_layers_16_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[888] + alloc1404: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv224, model_decoder_layers_16_encoder_attn_out_proj_weight5, model_decoder_layers_16_encoder_attn_out_proj_bias5, alloc1400, alloc1404) + R.vm.kill_object(alloc1400) + R.vm.kill_object(lv224) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias5) + model_decoder_layers_16_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[895] + model_decoder_layers_16_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[896] + alloc1405: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1404, model_decoder_layers_16_final_layer_norm_weight5, model_decoder_layers_16_final_layer_norm_bias5, alloc1405) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias5) + model_decoder_layers_16_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] + model_decoder_layers_16_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[892] + alloc1406: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1405, model_decoder_layers_16_fc1_weight5, model_decoder_layers_16_fc1_bias5, alloc1406) + R.vm.kill_object(alloc1405) + R.vm.kill_object(model_decoder_layers_16_fc1_weight5) + R.vm.kill_object(model_decoder_layers_16_fc1_bias5) + model_decoder_layers_16_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] + model_decoder_layers_16_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[894] + alloc1407: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1406, model_decoder_layers_16_fc2_weight5, model_decoder_layers_16_fc2_bias5, alloc1404, alloc1407) + R.vm.kill_object(alloc1404) + R.vm.kill_object(alloc1406) + R.vm.kill_object(model_decoder_layers_16_fc2_weight5) + R.vm.kill_object(model_decoder_layers_16_fc2_bias5) + 
model_decoder_layers_17_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[904] + model_decoder_layers_17_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[905] + alloc1408: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1407, model_decoder_layers_17_self_attn_layer_norm_weight5, model_decoder_layers_17_self_attn_layer_norm_bias5, alloc1408) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias5) + model_decoder_layers_17_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[900] + model_decoder_layers_17_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[901] + alloc1409: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_q_proj_weight5, model_decoder_layers_17_self_attn_q_proj_bias5, alloc1409) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias5) + model_decoder_layers_17_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] + alloc1410: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1408, model_decoder_layers_17_self_attn_k_proj_weight5, alloc1410) + R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight5) + model_decoder_layers_17_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] + model_decoder_layers_17_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[899] + alloc1411: R.Tensor((1, 1, 1280), dtype="float16") = 
R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1408, model_decoder_layers_17_self_attn_v_proj_weight5, model_decoder_layers_17_self_attn_v_proj_bias5, alloc1411) + R.vm.kill_object(alloc1408) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias5) + alloc1412: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1409, alloc1410, alloc1411, alloc1412) + R.vm.kill_object(alloc1409) + R.vm.kill_object(alloc1410) + R.vm.kill_object(alloc1411) + alloc1413: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1411: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), alloc1412, alloc1413) + R.vm.kill_object(alloc1412) + lv231: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1413, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1413) + model_decoder_layers_17_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] + model_decoder_layers_17_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[903] + alloc1414: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv231, model_decoder_layers_17_self_attn_out_proj_weight5, model_decoder_layers_17_self_attn_out_proj_bias5, alloc1407, alloc1414) + R.vm.kill_object(alloc1407) + R.vm.kill_object(lv231) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias5) + model_decoder_layers_17_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[913] + model_decoder_layers_17_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[914] + alloc1415: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1414, model_decoder_layers_17_encoder_attn_layer_norm_weight5, model_decoder_layers_17_encoder_attn_layer_norm_bias5, alloc1415) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias5) + model_decoder_layers_17_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] + model_decoder_layers_17_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[910] + alloc1416: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1415, model_decoder_layers_17_encoder_attn_q_proj_weight5, model_decoder_layers_17_encoder_attn_q_proj_bias5, alloc1416) + R.vm.kill_object(alloc1415) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias5) + lv234: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1416, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1416) + alloc1417: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1415: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), lv234, alloc1417) + R.vm.kill_object(lv234) + lv235: R.Tensor((1, 1, 
1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1417, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1417) + model_decoder_layers_17_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] + model_decoder_layers_17_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[912] + alloc1418: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv235, model_decoder_layers_17_encoder_attn_out_proj_weight5, model_decoder_layers_17_encoder_attn_out_proj_bias5, alloc1414, alloc1418) + R.vm.kill_object(alloc1414) + R.vm.kill_object(lv235) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias5) + model_decoder_layers_17_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[919] + model_decoder_layers_17_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[920] + alloc1419: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1418, model_decoder_layers_17_final_layer_norm_weight5, model_decoder_layers_17_final_layer_norm_bias5, alloc1419) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias5) + model_decoder_layers_17_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] + model_decoder_layers_17_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[916] + alloc1420: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1419, model_decoder_layers_17_fc1_weight5, 
model_decoder_layers_17_fc1_bias5, alloc1420) + R.vm.kill_object(alloc1419) + R.vm.kill_object(model_decoder_layers_17_fc1_weight5) + R.vm.kill_object(model_decoder_layers_17_fc1_bias5) + model_decoder_layers_17_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] + model_decoder_layers_17_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[918] + alloc1421: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1420, model_decoder_layers_17_fc2_weight5, model_decoder_layers_17_fc2_bias5, alloc1418, alloc1421) + R.vm.kill_object(alloc1418) + R.vm.kill_object(alloc1420) + R.vm.kill_object(model_decoder_layers_17_fc2_weight5) + R.vm.kill_object(model_decoder_layers_17_fc2_bias5) + model_decoder_layers_18_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[928] + model_decoder_layers_18_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[929] + alloc1422: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1421, model_decoder_layers_18_self_attn_layer_norm_weight5, model_decoder_layers_18_self_attn_layer_norm_bias5, alloc1422) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias5) + model_decoder_layers_18_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] + model_decoder_layers_18_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[925] + alloc1423: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_q_proj_weight5, model_decoder_layers_18_self_attn_q_proj_bias5, alloc1423) + 
R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias5) + model_decoder_layers_18_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] + alloc1424: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1422, model_decoder_layers_18_self_attn_k_proj_weight5, alloc1424) + R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight5) + model_decoder_layers_18_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] + model_decoder_layers_18_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[923] + alloc1425: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1422, model_decoder_layers_18_self_attn_v_proj_weight5, model_decoder_layers_18_self_attn_v_proj_bias5, alloc1425) + R.vm.kill_object(alloc1422) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias5) + alloc1426: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1423, alloc1424, alloc1425, alloc1426) + R.vm.kill_object(alloc1423) + R.vm.kill_object(alloc1424) + R.vm.kill_object(alloc1425) + alloc1427: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1425: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), alloc1426, alloc1427) + R.vm.kill_object(alloc1426) + lv242: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1427, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1427) + model_decoder_layers_18_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] + model_decoder_layers_18_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[927] + alloc1428: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv242, model_decoder_layers_18_self_attn_out_proj_weight5, model_decoder_layers_18_self_attn_out_proj_bias5, alloc1421, alloc1428) + R.vm.kill_object(alloc1421) + R.vm.kill_object(lv242) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias5) + model_decoder_layers_18_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[937] + model_decoder_layers_18_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[938] + alloc1429: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1428, model_decoder_layers_18_encoder_attn_layer_norm_weight5, model_decoder_layers_18_encoder_attn_layer_norm_bias5, alloc1429) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias5) + model_decoder_layers_18_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] + model_decoder_layers_18_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[934] + alloc1430: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1429, model_decoder_layers_18_encoder_attn_q_proj_weight5, 
model_decoder_layers_18_encoder_attn_q_proj_bias5, alloc1430) + R.vm.kill_object(alloc1429) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias5) + lv245: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1430, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1430) + alloc1431: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1429: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), lv245, alloc1431) + R.vm.kill_object(lv245) + lv246: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1431, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1431) + model_decoder_layers_18_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] + model_decoder_layers_18_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[936] + alloc1432: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv246, model_decoder_layers_18_encoder_attn_out_proj_weight5, model_decoder_layers_18_encoder_attn_out_proj_bias5, alloc1428, alloc1432) + R.vm.kill_object(alloc1428) + R.vm.kill_object(lv246) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias5) + model_decoder_layers_18_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[943] + model_decoder_layers_18_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[944] + alloc1433: R.Tensor((1, 1, 1280), dtype="float16") = 
R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1432, model_decoder_layers_18_final_layer_norm_weight5, model_decoder_layers_18_final_layer_norm_bias5, alloc1433) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias5) + model_decoder_layers_18_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] + model_decoder_layers_18_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[940] + alloc1434: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1433, model_decoder_layers_18_fc1_weight5, model_decoder_layers_18_fc1_bias5, alloc1434) + R.vm.kill_object(alloc1433) + R.vm.kill_object(model_decoder_layers_18_fc1_weight5) + R.vm.kill_object(model_decoder_layers_18_fc1_bias5) + model_decoder_layers_18_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] + model_decoder_layers_18_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[942] + alloc1435: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1434, model_decoder_layers_18_fc2_weight5, model_decoder_layers_18_fc2_bias5, alloc1432, alloc1435) + R.vm.kill_object(alloc1432) + R.vm.kill_object(alloc1434) + R.vm.kill_object(model_decoder_layers_18_fc2_weight5) + R.vm.kill_object(model_decoder_layers_18_fc2_bias5) + model_decoder_layers_19_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[952] + model_decoder_layers_19_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[953] + alloc1436: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.layer_norm3(alloc1435, model_decoder_layers_19_self_attn_layer_norm_weight5, model_decoder_layers_19_self_attn_layer_norm_bias5, alloc1436) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias5) + model_decoder_layers_19_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] + model_decoder_layers_19_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[949] + alloc1437: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_q_proj_weight5, model_decoder_layers_19_self_attn_q_proj_bias5, alloc1437) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias5) + model_decoder_layers_19_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] + alloc1438: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1436, model_decoder_layers_19_self_attn_k_proj_weight5, alloc1438) + R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight5) + model_decoder_layers_19_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] + model_decoder_layers_19_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[947] + alloc1439: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1436, model_decoder_layers_19_self_attn_v_proj_weight5, model_decoder_layers_19_self_attn_v_proj_bias5, alloc1439) + R.vm.kill_object(alloc1436) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias5) + alloc1440: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1437, alloc1438, alloc1439, alloc1440) + R.vm.kill_object(alloc1437) + R.vm.kill_object(alloc1438) + R.vm.kill_object(alloc1439) + alloc1441: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1439: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), alloc1440, alloc1441) + R.vm.kill_object(alloc1440) + lv253: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1441, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1441) + model_decoder_layers_19_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] + model_decoder_layers_19_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[951] + alloc1442: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv253, model_decoder_layers_19_self_attn_out_proj_weight5, model_decoder_layers_19_self_attn_out_proj_bias5, alloc1435, alloc1442) + R.vm.kill_object(alloc1435) + R.vm.kill_object(lv253) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias5) + model_decoder_layers_19_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[961] + model_decoder_layers_19_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[962] + alloc1443: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, 
R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1442, model_decoder_layers_19_encoder_attn_layer_norm_weight5, model_decoder_layers_19_encoder_attn_layer_norm_bias5, alloc1443) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias5) + model_decoder_layers_19_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] + model_decoder_layers_19_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[958] + alloc1444: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1443, model_decoder_layers_19_encoder_attn_q_proj_weight5, model_decoder_layers_19_encoder_attn_q_proj_bias5, alloc1444) + R.vm.kill_object(alloc1443) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias5) + lv256: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1444, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1444) + alloc1445: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1443: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), lv256, alloc1445) + R.vm.kill_object(lv256) + lv257: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1445, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1445) + model_decoder_layers_19_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] + model_decoder_layers_19_encoder_attn_out_proj_bias5: R.Tensor((1280,), 
dtype="float16") = packed_params[960] + alloc1446: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv257, model_decoder_layers_19_encoder_attn_out_proj_weight5, model_decoder_layers_19_encoder_attn_out_proj_bias5, alloc1442, alloc1446) + R.vm.kill_object(alloc1442) + R.vm.kill_object(lv257) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias5) + model_decoder_layers_19_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[967] + model_decoder_layers_19_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[968] + alloc1447: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1446, model_decoder_layers_19_final_layer_norm_weight5, model_decoder_layers_19_final_layer_norm_bias5, alloc1447) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias5) + model_decoder_layers_19_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] + model_decoder_layers_19_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[964] + alloc1448: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1447, model_decoder_layers_19_fc1_weight5, model_decoder_layers_19_fc1_bias5, alloc1448) + R.vm.kill_object(alloc1447) + R.vm.kill_object(model_decoder_layers_19_fc1_weight5) + R.vm.kill_object(model_decoder_layers_19_fc1_bias5) + model_decoder_layers_19_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] + model_decoder_layers_19_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[966] + alloc1449: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1448, model_decoder_layers_19_fc2_weight5, model_decoder_layers_19_fc2_bias5, alloc1446, alloc1449) + R.vm.kill_object(alloc1446) + R.vm.kill_object(alloc1448) + R.vm.kill_object(model_decoder_layers_19_fc2_weight5) + R.vm.kill_object(model_decoder_layers_19_fc2_bias5) + model_decoder_layers_20_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[976] + model_decoder_layers_20_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[977] + alloc1450: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1449, model_decoder_layers_20_self_attn_layer_norm_weight5, model_decoder_layers_20_self_attn_layer_norm_bias5, alloc1450) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias5) + model_decoder_layers_20_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] + model_decoder_layers_20_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[973] + alloc1451: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_q_proj_weight5, model_decoder_layers_20_self_attn_q_proj_bias5, alloc1451) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias5) + model_decoder_layers_20_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] + alloc1452: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.NT_matmul(alloc1450, model_decoder_layers_20_self_attn_k_proj_weight5, alloc1452) + R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight5) + model_decoder_layers_20_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] + model_decoder_layers_20_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[971] + alloc1453: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1450, model_decoder_layers_20_self_attn_v_proj_weight5, model_decoder_layers_20_self_attn_v_proj_bias5, alloc1453) + R.vm.kill_object(alloc1450) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias5) + alloc1454: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1451, alloc1452, alloc1453, alloc1454) + R.vm.kill_object(alloc1451) + R.vm.kill_object(alloc1452) + R.vm.kill_object(alloc1453) + alloc1455: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), alloc1454, alloc1455) + R.vm.kill_object(alloc1454) + lv264_1: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1455, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1455) + model_decoder_layers_20_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] + model_decoder_layers_20_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[975] + alloc1456: R.Tensor((1, 1, 1280), 
dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv264_1, model_decoder_layers_20_self_attn_out_proj_weight5, model_decoder_layers_20_self_attn_out_proj_bias5, alloc1449, alloc1456) + R.vm.kill_object(alloc1449) + R.vm.kill_object(lv264_1) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias5) + model_decoder_layers_20_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[985] + model_decoder_layers_20_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[986] + alloc1457: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1456, model_decoder_layers_20_encoder_attn_layer_norm_weight5, model_decoder_layers_20_encoder_attn_layer_norm_bias5, alloc1457) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias5) + model_decoder_layers_20_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] + model_decoder_layers_20_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[982] + alloc1458: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1457, model_decoder_layers_20_encoder_attn_q_proj_weight5, model_decoder_layers_20_encoder_attn_q_proj_bias5, alloc1458) + R.vm.kill_object(alloc1457) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias5) + lv267: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1458, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), 
dtype="float16"),)) + R.vm.kill_object(alloc1458) + alloc1459: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1457: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), lv267, alloc1459) + R.vm.kill_object(lv267) + lv268: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1459, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1459) + model_decoder_layers_20_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] + model_decoder_layers_20_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[984] + alloc1460: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv268, model_decoder_layers_20_encoder_attn_out_proj_weight5, model_decoder_layers_20_encoder_attn_out_proj_bias5, alloc1456, alloc1460) + R.vm.kill_object(alloc1456) + R.vm.kill_object(lv268) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias5) + model_decoder_layers_20_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[991] + model_decoder_layers_20_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[992] + alloc1461: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1460, model_decoder_layers_20_final_layer_norm_weight5, model_decoder_layers_20_final_layer_norm_bias5, alloc1461) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias5) + 
model_decoder_layers_20_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] + model_decoder_layers_20_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[988] + alloc1462: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1461, model_decoder_layers_20_fc1_weight5, model_decoder_layers_20_fc1_bias5, alloc1462) + R.vm.kill_object(alloc1461) + R.vm.kill_object(model_decoder_layers_20_fc1_weight5) + R.vm.kill_object(model_decoder_layers_20_fc1_bias5) + model_decoder_layers_20_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] + model_decoder_layers_20_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[990] + alloc1463: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1462, model_decoder_layers_20_fc2_weight5, model_decoder_layers_20_fc2_bias5, alloc1460, alloc1463) + R.vm.kill_object(alloc1460) + R.vm.kill_object(alloc1462) + R.vm.kill_object(model_decoder_layers_20_fc2_weight5) + R.vm.kill_object(model_decoder_layers_20_fc2_bias5) + model_decoder_layers_21_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1000] + model_decoder_layers_21_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1001] + alloc1464: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1463, model_decoder_layers_21_self_attn_layer_norm_weight5, model_decoder_layers_21_self_attn_layer_norm_bias5, alloc1464) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias5) + model_decoder_layers_21_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[996] + model_decoder_layers_21_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[997] + alloc1465: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_q_proj_weight5, model_decoder_layers_21_self_attn_q_proj_bias5, alloc1465) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias5) + model_decoder_layers_21_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] + alloc1466: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1464, model_decoder_layers_21_self_attn_k_proj_weight5, alloc1466) + R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight5) + model_decoder_layers_21_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] + model_decoder_layers_21_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[995] + alloc1467: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1464, model_decoder_layers_21_self_attn_v_proj_weight5, model_decoder_layers_21_self_attn_v_proj_bias5, alloc1467) + R.vm.kill_object(alloc1464) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias5) + alloc1468: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1465, alloc1466, alloc1467, alloc1468) + R.vm.kill_object(alloc1465) + R.vm.kill_object(alloc1466) + R.vm.kill_object(alloc1467) + alloc1469: 
R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1467: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), alloc1468, alloc1469) + R.vm.kill_object(alloc1468) + lv275: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1469, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1469) + model_decoder_layers_21_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] + model_decoder_layers_21_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[999] + alloc1470: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv275, model_decoder_layers_21_self_attn_out_proj_weight5, model_decoder_layers_21_self_attn_out_proj_bias5, alloc1463, alloc1470) + R.vm.kill_object(alloc1463) + R.vm.kill_object(lv275) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias5) + model_decoder_layers_21_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1009] + model_decoder_layers_21_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1010] + alloc1471: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1470, model_decoder_layers_21_encoder_attn_layer_norm_weight5, model_decoder_layers_21_encoder_attn_layer_norm_bias5, alloc1471) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias5) + model_decoder_layers_21_encoder_attn_q_proj_weight5: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] + model_decoder_layers_21_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1006] + alloc1472: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1471, model_decoder_layers_21_encoder_attn_q_proj_weight5, model_decoder_layers_21_encoder_attn_q_proj_bias5, alloc1472) + R.vm.kill_object(alloc1471) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias5) + lv278: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1472, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1472) + alloc1473: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1471: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), lv278, alloc1473) + R.vm.kill_object(lv278) + lv279: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1473, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1473) + model_decoder_layers_21_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] + model_decoder_layers_21_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1008] + alloc1474: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv279, model_decoder_layers_21_encoder_attn_out_proj_weight5, model_decoder_layers_21_encoder_attn_out_proj_bias5, alloc1470, alloc1474) + R.vm.kill_object(alloc1470) + R.vm.kill_object(lv279) + 
R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias5) + model_decoder_layers_21_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1015] + model_decoder_layers_21_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1016] + alloc1475: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1474, model_decoder_layers_21_final_layer_norm_weight5, model_decoder_layers_21_final_layer_norm_bias5, alloc1475) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias5) + model_decoder_layers_21_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] + model_decoder_layers_21_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1012] + alloc1476: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1475, model_decoder_layers_21_fc1_weight5, model_decoder_layers_21_fc1_bias5, alloc1476) + R.vm.kill_object(alloc1475) + R.vm.kill_object(model_decoder_layers_21_fc1_weight5) + R.vm.kill_object(model_decoder_layers_21_fc1_bias5) + model_decoder_layers_21_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] + model_decoder_layers_21_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1014] + alloc1477: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1476, model_decoder_layers_21_fc2_weight5, model_decoder_layers_21_fc2_bias5, alloc1474, alloc1477) + R.vm.kill_object(alloc1474) + R.vm.kill_object(alloc1476) + R.vm.kill_object(model_decoder_layers_21_fc2_weight5) + 
R.vm.kill_object(model_decoder_layers_21_fc2_bias5) + model_decoder_layers_22_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1024] + model_decoder_layers_22_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1025] + alloc1478: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1477, model_decoder_layers_22_self_attn_layer_norm_weight5, model_decoder_layers_22_self_attn_layer_norm_bias5, alloc1478) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias5) + model_decoder_layers_22_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] + model_decoder_layers_22_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1021] + alloc1479: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_q_proj_weight5, model_decoder_layers_22_self_attn_q_proj_bias5, alloc1479) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias5) + model_decoder_layers_22_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] + alloc1480: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1478, model_decoder_layers_22_self_attn_k_proj_weight5, alloc1480) + R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight5) + model_decoder_layers_22_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] + model_decoder_layers_22_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1019] + 
alloc1481: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1478, model_decoder_layers_22_self_attn_v_proj_weight5, model_decoder_layers_22_self_attn_v_proj_bias5, alloc1481) + R.vm.kill_object(alloc1478) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias5) + alloc1482: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1479, alloc1480, alloc1481, alloc1482) + R.vm.kill_object(alloc1479) + R.vm.kill_object(alloc1480) + R.vm.kill_object(alloc1481) + alloc1483: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1481: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), alloc1482, alloc1483) + R.vm.kill_object(alloc1482) + lv286: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1483, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1483) + model_decoder_layers_22_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] + model_decoder_layers_22_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1023] + alloc1484: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv286, model_decoder_layers_22_self_attn_out_proj_weight5, model_decoder_layers_22_self_attn_out_proj_bias5, alloc1477, alloc1484) + R.vm.kill_object(alloc1477) + R.vm.kill_object(lv286) + 
R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias5) + model_decoder_layers_22_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1033] + model_decoder_layers_22_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1034] + alloc1485: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1484, model_decoder_layers_22_encoder_attn_layer_norm_weight5, model_decoder_layers_22_encoder_attn_layer_norm_bias5, alloc1485) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias5) + model_decoder_layers_22_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] + model_decoder_layers_22_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1030] + alloc1486: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1485, model_decoder_layers_22_encoder_attn_q_proj_weight5, model_decoder_layers_22_encoder_attn_q_proj_bias5, alloc1486) + R.vm.kill_object(alloc1485) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias5) + lv289: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1486, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1486) + alloc1487: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1485: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), 
lv289, alloc1487) + R.vm.kill_object(lv289) + lv290: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1487, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1487) + model_decoder_layers_22_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] + model_decoder_layers_22_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1032] + alloc1488: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv290, model_decoder_layers_22_encoder_attn_out_proj_weight5, model_decoder_layers_22_encoder_attn_out_proj_bias5, alloc1484, alloc1488) + R.vm.kill_object(alloc1484) + R.vm.kill_object(lv290) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias5) + model_decoder_layers_22_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1039] + model_decoder_layers_22_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1040] + alloc1489: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1488, model_decoder_layers_22_final_layer_norm_weight5, model_decoder_layers_22_final_layer_norm_bias5, alloc1489) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias5) + model_decoder_layers_22_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] + model_decoder_layers_22_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1036] + alloc1490: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + 
cls.fused_NT_matmul1_add8_gelu2(alloc1489, model_decoder_layers_22_fc1_weight5, model_decoder_layers_22_fc1_bias5, alloc1490) + R.vm.kill_object(alloc1489) + R.vm.kill_object(model_decoder_layers_22_fc1_weight5) + R.vm.kill_object(model_decoder_layers_22_fc1_bias5) + model_decoder_layers_22_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] + model_decoder_layers_22_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1038] + alloc1491: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1490, model_decoder_layers_22_fc2_weight5, model_decoder_layers_22_fc2_bias5, alloc1488, alloc1491) + R.vm.kill_object(alloc1488) + R.vm.kill_object(alloc1490) + R.vm.kill_object(model_decoder_layers_22_fc2_weight5) + R.vm.kill_object(model_decoder_layers_22_fc2_bias5) + model_decoder_layers_23_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1048] + model_decoder_layers_23_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1049] + alloc1492: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1491, model_decoder_layers_23_self_attn_layer_norm_weight5, model_decoder_layers_23_self_attn_layer_norm_bias5, alloc1492) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias5) + model_decoder_layers_23_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] + model_decoder_layers_23_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1045] + alloc1493: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1492, 
model_decoder_layers_23_self_attn_q_proj_weight5, model_decoder_layers_23_self_attn_q_proj_bias5, alloc1493) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias5) + model_decoder_layers_23_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] + alloc1494: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1492, model_decoder_layers_23_self_attn_k_proj_weight5, alloc1494) + R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight5) + model_decoder_layers_23_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] + model_decoder_layers_23_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1043] + alloc1495: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1492, model_decoder_layers_23_self_attn_v_proj_weight5, model_decoder_layers_23_self_attn_v_proj_bias5, alloc1495) + R.vm.kill_object(alloc1492) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias5) + alloc1496: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1493, alloc1494, alloc1495, alloc1496) + R.vm.kill_object(alloc1493) + R.vm.kill_object(alloc1494) + R.vm.kill_object(alloc1495) + alloc1497: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1495: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), alloc1496, alloc1497) + 
R.vm.kill_object(alloc1496) + lv297: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1497, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1497) + model_decoder_layers_23_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] + model_decoder_layers_23_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1047] + alloc1498: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv297, model_decoder_layers_23_self_attn_out_proj_weight5, model_decoder_layers_23_self_attn_out_proj_bias5, alloc1491, alloc1498) + R.vm.kill_object(alloc1491) + R.vm.kill_object(lv297) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias5) + model_decoder_layers_23_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1057] + model_decoder_layers_23_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1058] + alloc1499: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1498, model_decoder_layers_23_encoder_attn_layer_norm_weight5, model_decoder_layers_23_encoder_attn_layer_norm_bias5, alloc1499) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias5) + model_decoder_layers_23_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] + model_decoder_layers_23_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1054] + alloc1500: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1499, model_decoder_layers_23_encoder_attn_q_proj_weight5, model_decoder_layers_23_encoder_attn_q_proj_bias5, alloc1500) + R.vm.kill_object(alloc1499) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias5) + lv300: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1500, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1500) + alloc1501: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), lv300, alloc1501) + R.vm.kill_object(lv300) + lv301: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1501, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1501) + model_decoder_layers_23_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] + model_decoder_layers_23_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1056] + alloc1502: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv301, model_decoder_layers_23_encoder_attn_out_proj_weight5, model_decoder_layers_23_encoder_attn_out_proj_bias5, alloc1498, alloc1502) + R.vm.kill_object(alloc1498) + R.vm.kill_object(lv301) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias5) + model_decoder_layers_23_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1063] + model_decoder_layers_23_final_layer_norm_bias5: 
R.Tensor((1280,), dtype="float16") = packed_params[1064] + alloc1503: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1502, model_decoder_layers_23_final_layer_norm_weight5, model_decoder_layers_23_final_layer_norm_bias5, alloc1503) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias5) + model_decoder_layers_23_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] + model_decoder_layers_23_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1060] + alloc1504: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1503, model_decoder_layers_23_fc1_weight5, model_decoder_layers_23_fc1_bias5, alloc1504) + R.vm.kill_object(alloc1503) + R.vm.kill_object(model_decoder_layers_23_fc1_weight5) + R.vm.kill_object(model_decoder_layers_23_fc1_bias5) + model_decoder_layers_23_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] + model_decoder_layers_23_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1062] + alloc1505: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1504, model_decoder_layers_23_fc2_weight5, model_decoder_layers_23_fc2_bias5, alloc1502, alloc1505) + R.vm.kill_object(alloc1502) + R.vm.kill_object(alloc1504) + R.vm.kill_object(model_decoder_layers_23_fc2_weight5) + R.vm.kill_object(model_decoder_layers_23_fc2_bias5) + model_decoder_layers_24_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1072] + model_decoder_layers_24_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1073] + alloc1506: R.Tensor((1, 1, 1280), 
dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1505, model_decoder_layers_24_self_attn_layer_norm_weight5, model_decoder_layers_24_self_attn_layer_norm_bias5, alloc1506) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias5) + model_decoder_layers_24_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] + model_decoder_layers_24_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1069] + alloc1507: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_q_proj_weight5, model_decoder_layers_24_self_attn_q_proj_bias5, alloc1507) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias5) + model_decoder_layers_24_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] + alloc1508: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1506, model_decoder_layers_24_self_attn_k_proj_weight5, alloc1508) + R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight5) + model_decoder_layers_24_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] + model_decoder_layers_24_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1067] + alloc1509: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1506, model_decoder_layers_24_self_attn_v_proj_weight5, model_decoder_layers_24_self_attn_v_proj_bias5, alloc1509) + 
R.vm.kill_object(alloc1506) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias5) + alloc1510: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1507, alloc1508, alloc1509, alloc1510) + R.vm.kill_object(alloc1507) + R.vm.kill_object(alloc1508) + R.vm.kill_object(alloc1509) + alloc1511: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1509: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), alloc1510, alloc1511) + R.vm.kill_object(alloc1510) + lv308: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1511, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1511) + model_decoder_layers_24_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] + model_decoder_layers_24_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1071] + alloc1512: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv308, model_decoder_layers_24_self_attn_out_proj_weight5, model_decoder_layers_24_self_attn_out_proj_bias5, alloc1505, alloc1512) + R.vm.kill_object(alloc1505) + R.vm.kill_object(lv308) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias5) + model_decoder_layers_24_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1081] + model_decoder_layers_24_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = 
packed_params[1082] + alloc1513: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1512, model_decoder_layers_24_encoder_attn_layer_norm_weight5, model_decoder_layers_24_encoder_attn_layer_norm_bias5, alloc1513) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias5) + model_decoder_layers_24_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] + model_decoder_layers_24_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1078] + alloc1514: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1513, model_decoder_layers_24_encoder_attn_q_proj_weight5, model_decoder_layers_24_encoder_attn_q_proj_bias5, alloc1514) + R.vm.kill_object(alloc1513) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias5) + lv311: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1514, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1514) + alloc1515: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1513: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), lv311, alloc1515) + R.vm.kill_object(lv311) + lv312: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1515, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1515) + model_decoder_layers_24_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") 
= packed_params[1079] + model_decoder_layers_24_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1080] + alloc1516: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv312, model_decoder_layers_24_encoder_attn_out_proj_weight5, model_decoder_layers_24_encoder_attn_out_proj_bias5, alloc1512, alloc1516) + R.vm.kill_object(alloc1512) + R.vm.kill_object(lv312) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias5) + model_decoder_layers_24_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1087] + model_decoder_layers_24_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1088] + alloc1517: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1516, model_decoder_layers_24_final_layer_norm_weight5, model_decoder_layers_24_final_layer_norm_bias5, alloc1517) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias5) + model_decoder_layers_24_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] + model_decoder_layers_24_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1084] + alloc1518: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1517, model_decoder_layers_24_fc1_weight5, model_decoder_layers_24_fc1_bias5, alloc1518) + R.vm.kill_object(alloc1517) + R.vm.kill_object(model_decoder_layers_24_fc1_weight5) + R.vm.kill_object(model_decoder_layers_24_fc1_bias5) + model_decoder_layers_24_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] + 
model_decoder_layers_24_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1086] + alloc1519: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1518, model_decoder_layers_24_fc2_weight5, model_decoder_layers_24_fc2_bias5, alloc1516, alloc1519) + R.vm.kill_object(alloc1516) + R.vm.kill_object(alloc1518) + R.vm.kill_object(model_decoder_layers_24_fc2_weight5) + R.vm.kill_object(model_decoder_layers_24_fc2_bias5) + model_decoder_layers_25_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1096] + model_decoder_layers_25_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1097] + alloc1520: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1519, model_decoder_layers_25_self_attn_layer_norm_weight5, model_decoder_layers_25_self_attn_layer_norm_bias5, alloc1520) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias5) + model_decoder_layers_25_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] + model_decoder_layers_25_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1093] + alloc1521: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_q_proj_weight5, model_decoder_layers_25_self_attn_q_proj_bias5, alloc1521) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias5) + model_decoder_layers_25_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] + alloc1522: R.Tensor((1, 1, 1280), 
dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1520, model_decoder_layers_25_self_attn_k_proj_weight5, alloc1522) + R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight5) + model_decoder_layers_25_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] + model_decoder_layers_25_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1091] + alloc1523: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1520, model_decoder_layers_25_self_attn_v_proj_weight5, model_decoder_layers_25_self_attn_v_proj_bias5, alloc1523) + R.vm.kill_object(alloc1520) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias5) + alloc1524: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1521, alloc1522, alloc1523, alloc1524) + R.vm.kill_object(alloc1521) + R.vm.kill_object(alloc1522) + R.vm.kill_object(alloc1523) + alloc1525: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1523: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), alloc1524, alloc1525) + R.vm.kill_object(alloc1524) + lv319: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1525, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1525) + model_decoder_layers_25_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] + 
model_decoder_layers_25_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1095] + alloc1526: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv319, model_decoder_layers_25_self_attn_out_proj_weight5, model_decoder_layers_25_self_attn_out_proj_bias5, alloc1519, alloc1526) + R.vm.kill_object(alloc1519) + R.vm.kill_object(lv319) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias5) + model_decoder_layers_25_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1105] + model_decoder_layers_25_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1106] + alloc1527: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1526, model_decoder_layers_25_encoder_attn_layer_norm_weight5, model_decoder_layers_25_encoder_attn_layer_norm_bias5, alloc1527) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias5) + model_decoder_layers_25_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] + model_decoder_layers_25_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1102] + alloc1528: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1527, model_decoder_layers_25_encoder_attn_q_proj_weight5, model_decoder_layers_25_encoder_attn_q_proj_bias5, alloc1528) + R.vm.kill_object(alloc1527) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias5) + lv322: 
R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1528, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1528) + alloc1529: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1527: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), lv322, alloc1529) + R.vm.kill_object(lv322) + lv323: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1529, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1529) + model_decoder_layers_25_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] + model_decoder_layers_25_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1104] + alloc1530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv323, model_decoder_layers_25_encoder_attn_out_proj_weight5, model_decoder_layers_25_encoder_attn_out_proj_bias5, alloc1526, alloc1530) + R.vm.kill_object(alloc1526) + R.vm.kill_object(lv323) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias5) + model_decoder_layers_25_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1111] + model_decoder_layers_25_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1112] + alloc1531: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1530, model_decoder_layers_25_final_layer_norm_weight5, model_decoder_layers_25_final_layer_norm_bias5, alloc1531) + 
R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias5) + model_decoder_layers_25_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] + model_decoder_layers_25_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1108] + alloc1532: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1531, model_decoder_layers_25_fc1_weight5, model_decoder_layers_25_fc1_bias5, alloc1532) + R.vm.kill_object(alloc1531) + R.vm.kill_object(model_decoder_layers_25_fc1_weight5) + R.vm.kill_object(model_decoder_layers_25_fc1_bias5) + model_decoder_layers_25_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] + model_decoder_layers_25_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1110] + alloc1533: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1532, model_decoder_layers_25_fc2_weight5, model_decoder_layers_25_fc2_bias5, alloc1530, alloc1533) + R.vm.kill_object(alloc1530) + R.vm.kill_object(alloc1532) + R.vm.kill_object(model_decoder_layers_25_fc2_weight5) + R.vm.kill_object(model_decoder_layers_25_fc2_bias5) + model_decoder_layers_26_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1120] + model_decoder_layers_26_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1121] + alloc1534: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1533, model_decoder_layers_26_self_attn_layer_norm_weight5, model_decoder_layers_26_self_attn_layer_norm_bias5, alloc1534) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight5) + 
R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias5) + model_decoder_layers_26_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] + model_decoder_layers_26_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1117] + alloc1535: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_q_proj_weight5, model_decoder_layers_26_self_attn_q_proj_bias5, alloc1535) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias5) + model_decoder_layers_26_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] + alloc1536: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1534, model_decoder_layers_26_self_attn_k_proj_weight5, alloc1536) + R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight5) + model_decoder_layers_26_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] + model_decoder_layers_26_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1115] + alloc1537: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1534, model_decoder_layers_26_self_attn_v_proj_weight5, model_decoder_layers_26_self_attn_v_proj_bias5, alloc1537) + R.vm.kill_object(alloc1534) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias5) + alloc1538: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + 
cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1535, alloc1536, alloc1537, alloc1538) + R.vm.kill_object(alloc1535) + R.vm.kill_object(alloc1536) + R.vm.kill_object(alloc1537) + alloc1539: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1537: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), alloc1538, alloc1539) + R.vm.kill_object(alloc1538) + lv330: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1539, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1539) + model_decoder_layers_26_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] + model_decoder_layers_26_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1119] + alloc1540: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv330, model_decoder_layers_26_self_attn_out_proj_weight5, model_decoder_layers_26_self_attn_out_proj_bias5, alloc1533, alloc1540) + R.vm.kill_object(alloc1533) + R.vm.kill_object(lv330) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias5) + model_decoder_layers_26_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1129] + model_decoder_layers_26_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1130] + alloc1541: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1540, model_decoder_layers_26_encoder_attn_layer_norm_weight5, model_decoder_layers_26_encoder_attn_layer_norm_bias5, 
alloc1541) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias5) + model_decoder_layers_26_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] + model_decoder_layers_26_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1126] + alloc1542: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1541, model_decoder_layers_26_encoder_attn_q_proj_weight5, model_decoder_layers_26_encoder_attn_q_proj_bias5, alloc1542) + R.vm.kill_object(alloc1541) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias5) + lv333: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1542, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1542) + alloc1543: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1541: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), lv333, alloc1543) + R.vm.kill_object(lv333) + lv334: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1543, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1543) + model_decoder_layers_26_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] + model_decoder_layers_26_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1128] + alloc1544: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.fused_NT_matmul_add7_add6(lv334, model_decoder_layers_26_encoder_attn_out_proj_weight5, model_decoder_layers_26_encoder_attn_out_proj_bias5, alloc1540, alloc1544) + R.vm.kill_object(alloc1540) + R.vm.kill_object(lv334) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias5) + model_decoder_layers_26_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1135] + model_decoder_layers_26_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1136] + alloc1545: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1544, model_decoder_layers_26_final_layer_norm_weight5, model_decoder_layers_26_final_layer_norm_bias5, alloc1545) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias5) + model_decoder_layers_26_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] + model_decoder_layers_26_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1132] + alloc1546: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1545, model_decoder_layers_26_fc1_weight5, model_decoder_layers_26_fc1_bias5, alloc1546) + R.vm.kill_object(alloc1545) + R.vm.kill_object(model_decoder_layers_26_fc1_weight5) + R.vm.kill_object(model_decoder_layers_26_fc1_bias5) + model_decoder_layers_26_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] + model_decoder_layers_26_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1134] + alloc1547: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1546, 
model_decoder_layers_26_fc2_weight5, model_decoder_layers_26_fc2_bias5, alloc1544, alloc1547) + R.vm.kill_object(alloc1544) + R.vm.kill_object(alloc1546) + R.vm.kill_object(model_decoder_layers_26_fc2_weight5) + R.vm.kill_object(model_decoder_layers_26_fc2_bias5) + model_decoder_layers_27_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1144] + model_decoder_layers_27_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1145] + alloc1548: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1547, model_decoder_layers_27_self_attn_layer_norm_weight5, model_decoder_layers_27_self_attn_layer_norm_bias5, alloc1548) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias5) + model_decoder_layers_27_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] + model_decoder_layers_27_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1141] + alloc1549: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_q_proj_weight5, model_decoder_layers_27_self_attn_q_proj_bias5, alloc1549) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias5) + model_decoder_layers_27_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] + alloc1550: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1548, model_decoder_layers_27_self_attn_k_proj_weight5, alloc1550) + R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight5) + 
model_decoder_layers_27_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] + model_decoder_layers_27_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1139] + alloc1551: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1548, model_decoder_layers_27_self_attn_v_proj_weight5, model_decoder_layers_27_self_attn_v_proj_bias5, alloc1551) + R.vm.kill_object(alloc1548) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias5) + alloc1552: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1549, alloc1550, alloc1551, alloc1552) + R.vm.kill_object(alloc1549) + R.vm.kill_object(alloc1550) + R.vm.kill_object(alloc1551) + alloc1553: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1551: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), alloc1552, alloc1553) + R.vm.kill_object(alloc1552) + lv341: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1553, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1553) + model_decoder_layers_27_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] + model_decoder_layers_27_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1143] + alloc1554: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv341, 
model_decoder_layers_27_self_attn_out_proj_weight5, model_decoder_layers_27_self_attn_out_proj_bias5, alloc1547, alloc1554) + R.vm.kill_object(alloc1547) + R.vm.kill_object(lv341) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias5) + model_decoder_layers_27_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1153] + model_decoder_layers_27_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1154] + alloc1555: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1554, model_decoder_layers_27_encoder_attn_layer_norm_weight5, model_decoder_layers_27_encoder_attn_layer_norm_bias5, alloc1555) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias5) + model_decoder_layers_27_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] + model_decoder_layers_27_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1150] + alloc1556: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1555, model_decoder_layers_27_encoder_attn_q_proj_weight5, model_decoder_layers_27_encoder_attn_q_proj_bias5, alloc1556) + R.vm.kill_object(alloc1555) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias5) + lv344: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1556, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1556) + alloc1557: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), 
R.shape([1, 20, 64]), R.dtype("float16")) + _1555: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), lv344, alloc1557) + R.vm.kill_object(lv344) + lv345: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1557, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1557) + model_decoder_layers_27_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] + model_decoder_layers_27_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1152] + alloc1558: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv345, model_decoder_layers_27_encoder_attn_out_proj_weight5, model_decoder_layers_27_encoder_attn_out_proj_bias5, alloc1554, alloc1558) + R.vm.kill_object(alloc1554) + R.vm.kill_object(lv345) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias5) + model_decoder_layers_27_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1159] + model_decoder_layers_27_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1160] + alloc1559: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1558, model_decoder_layers_27_final_layer_norm_weight5, model_decoder_layers_27_final_layer_norm_bias5, alloc1559) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias5) + model_decoder_layers_27_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] + model_decoder_layers_27_fc1_bias5: R.Tensor((5120,), dtype="float16") = 
packed_params[1156] + alloc1560: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1559, model_decoder_layers_27_fc1_weight5, model_decoder_layers_27_fc1_bias5, alloc1560) + R.vm.kill_object(alloc1559) + R.vm.kill_object(model_decoder_layers_27_fc1_weight5) + R.vm.kill_object(model_decoder_layers_27_fc1_bias5) + model_decoder_layers_27_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] + model_decoder_layers_27_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1158] + alloc1561: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1560, model_decoder_layers_27_fc2_weight5, model_decoder_layers_27_fc2_bias5, alloc1558, alloc1561) + R.vm.kill_object(alloc1558) + R.vm.kill_object(alloc1560) + R.vm.kill_object(model_decoder_layers_27_fc2_weight5) + R.vm.kill_object(model_decoder_layers_27_fc2_bias5) + model_decoder_layers_28_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1168] + model_decoder_layers_28_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1169] + alloc1562: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1561, model_decoder_layers_28_self_attn_layer_norm_weight5, model_decoder_layers_28_self_attn_layer_norm_bias5, alloc1562) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias5) + model_decoder_layers_28_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] + model_decoder_layers_28_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1165] + alloc1563: R.Tensor((1, 1, 1280), 
dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_q_proj_weight5, model_decoder_layers_28_self_attn_q_proj_bias5, alloc1563) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias5) + model_decoder_layers_28_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] + alloc1564: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1562, model_decoder_layers_28_self_attn_k_proj_weight5, alloc1564) + R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight5) + model_decoder_layers_28_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] + model_decoder_layers_28_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1163] + alloc1565: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1562, model_decoder_layers_28_self_attn_v_proj_weight5, model_decoder_layers_28_self_attn_v_proj_bias5, alloc1565) + R.vm.kill_object(alloc1562) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias5) + alloc1566: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1563, alloc1564, alloc1565, alloc1566) + R.vm.kill_object(alloc1563) + R.vm.kill_object(alloc1564) + R.vm.kill_object(alloc1565) + alloc1567: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1565: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), alloc1566, alloc1567) + R.vm.kill_object(alloc1566) + lv352: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1567, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1567) + model_decoder_layers_28_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] + model_decoder_layers_28_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1167] + alloc1568: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv352, model_decoder_layers_28_self_attn_out_proj_weight5, model_decoder_layers_28_self_attn_out_proj_bias5, alloc1561, alloc1568) + R.vm.kill_object(alloc1561) + R.vm.kill_object(lv352) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias5) + model_decoder_layers_28_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1177] + model_decoder_layers_28_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1178] + alloc1569: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1568, model_decoder_layers_28_encoder_attn_layer_norm_weight5, model_decoder_layers_28_encoder_attn_layer_norm_bias5, alloc1569) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias5) + model_decoder_layers_28_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] + model_decoder_layers_28_encoder_attn_q_proj_bias5: R.Tensor((1280,), 
dtype="float16") = packed_params[1174] + alloc1570: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1569, model_decoder_layers_28_encoder_attn_q_proj_weight5, model_decoder_layers_28_encoder_attn_q_proj_bias5, alloc1570) + R.vm.kill_object(alloc1569) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias5) + lv355: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1570, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1570) + alloc1571: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1569: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), lv355, alloc1571) + R.vm.kill_object(lv355) + lv356: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1571, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1571) + model_decoder_layers_28_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] + model_decoder_layers_28_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1176] + alloc1572: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv356, model_decoder_layers_28_encoder_attn_out_proj_weight5, model_decoder_layers_28_encoder_attn_out_proj_bias5, alloc1568, alloc1572) + R.vm.kill_object(alloc1568) + R.vm.kill_object(lv356) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias5) + 
model_decoder_layers_28_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1183] + model_decoder_layers_28_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1184] + alloc1573: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1572, model_decoder_layers_28_final_layer_norm_weight5, model_decoder_layers_28_final_layer_norm_bias5, alloc1573) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias5) + model_decoder_layers_28_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] + model_decoder_layers_28_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1180] + alloc1574: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1573, model_decoder_layers_28_fc1_weight5, model_decoder_layers_28_fc1_bias5, alloc1574) + R.vm.kill_object(alloc1573) + R.vm.kill_object(model_decoder_layers_28_fc1_weight5) + R.vm.kill_object(model_decoder_layers_28_fc1_bias5) + model_decoder_layers_28_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] + model_decoder_layers_28_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1182] + alloc1575: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1574, model_decoder_layers_28_fc2_weight5, model_decoder_layers_28_fc2_bias5, alloc1572, alloc1575) + R.vm.kill_object(alloc1572) + R.vm.kill_object(alloc1574) + R.vm.kill_object(model_decoder_layers_28_fc2_weight5) + R.vm.kill_object(model_decoder_layers_28_fc2_bias5) + model_decoder_layers_29_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1192] 
+ model_decoder_layers_29_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1193] + alloc1576: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1575, model_decoder_layers_29_self_attn_layer_norm_weight5, model_decoder_layers_29_self_attn_layer_norm_bias5, alloc1576) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias5) + model_decoder_layers_29_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] + model_decoder_layers_29_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1189] + alloc1577: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_q_proj_weight5, model_decoder_layers_29_self_attn_q_proj_bias5, alloc1577) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias5) + model_decoder_layers_29_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] + alloc1578: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1576, model_decoder_layers_29_self_attn_k_proj_weight5, alloc1578) + R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight5) + model_decoder_layers_29_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] + model_decoder_layers_29_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1187] + alloc1579: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.fused_NT_matmul_add7(alloc1576, model_decoder_layers_29_self_attn_v_proj_weight5, model_decoder_layers_29_self_attn_v_proj_bias5, alloc1579) + R.vm.kill_object(alloc1576) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias5) + alloc1580: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1577, alloc1578, alloc1579, alloc1580) + R.vm.kill_object(alloc1577) + R.vm.kill_object(alloc1578) + R.vm.kill_object(alloc1579) + alloc1581: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1579: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), alloc1580, alloc1581) + R.vm.kill_object(alloc1580) + lv363: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1581, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1581) + model_decoder_layers_29_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] + model_decoder_layers_29_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1191] + alloc1582: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv363, model_decoder_layers_29_self_attn_out_proj_weight5, model_decoder_layers_29_self_attn_out_proj_bias5, alloc1575, alloc1582) + R.vm.kill_object(alloc1575) + R.vm.kill_object(lv363) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias5) + model_decoder_layers_29_encoder_attn_layer_norm_weight5: 
R.Tensor((1280,), dtype="float16") = packed_params[1201] + model_decoder_layers_29_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1202] + alloc1583: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1582, model_decoder_layers_29_encoder_attn_layer_norm_weight5, model_decoder_layers_29_encoder_attn_layer_norm_bias5, alloc1583) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias5) + model_decoder_layers_29_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] + model_decoder_layers_29_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1198] + alloc1584: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1583, model_decoder_layers_29_encoder_attn_q_proj_weight5, model_decoder_layers_29_encoder_attn_q_proj_bias5, alloc1584) + R.vm.kill_object(alloc1583) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias5) + lv366: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1584, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1584) + alloc1585: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1583: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), lv366, alloc1585) + R.vm.kill_object(lv366) + lv367: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1585, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 
1280), dtype="float16"),)) + R.vm.kill_object(alloc1585) + model_decoder_layers_29_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] + model_decoder_layers_29_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1200] + alloc1586: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv367, model_decoder_layers_29_encoder_attn_out_proj_weight5, model_decoder_layers_29_encoder_attn_out_proj_bias5, alloc1582, alloc1586) + R.vm.kill_object(alloc1582) + R.vm.kill_object(lv367) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias5) + model_decoder_layers_29_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1207] + model_decoder_layers_29_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1208] + alloc1587: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1586, model_decoder_layers_29_final_layer_norm_weight5, model_decoder_layers_29_final_layer_norm_bias5, alloc1587) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias5) + model_decoder_layers_29_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] + model_decoder_layers_29_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1204] + alloc1588: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1587, model_decoder_layers_29_fc1_weight5, model_decoder_layers_29_fc1_bias5, alloc1588) + R.vm.kill_object(alloc1587) + R.vm.kill_object(model_decoder_layers_29_fc1_weight5) + 
R.vm.kill_object(model_decoder_layers_29_fc1_bias5) + model_decoder_layers_29_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] + model_decoder_layers_29_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1206] + alloc1589: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1588, model_decoder_layers_29_fc2_weight5, model_decoder_layers_29_fc2_bias5, alloc1586, alloc1589) + R.vm.kill_object(alloc1586) + R.vm.kill_object(alloc1588) + R.vm.kill_object(model_decoder_layers_29_fc2_weight5) + R.vm.kill_object(model_decoder_layers_29_fc2_bias5) + model_decoder_layers_30_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1216] + model_decoder_layers_30_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1217] + alloc1590: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1589, model_decoder_layers_30_self_attn_layer_norm_weight5, model_decoder_layers_30_self_attn_layer_norm_bias5, alloc1590) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias5) + model_decoder_layers_30_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] + model_decoder_layers_30_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1213] + alloc1591: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_q_proj_weight5, model_decoder_layers_30_self_attn_q_proj_bias5, alloc1591) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight5) + 
R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias5) + model_decoder_layers_30_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] + alloc1592: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1590, model_decoder_layers_30_self_attn_k_proj_weight5, alloc1592) + R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight5) + model_decoder_layers_30_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] + model_decoder_layers_30_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1211] + alloc1593: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1590, model_decoder_layers_30_self_attn_v_proj_weight5, model_decoder_layers_30_self_attn_v_proj_bias5, alloc1593) + R.vm.kill_object(alloc1590) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias5) + alloc1594: R.Tensor((1, 60, 64), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1591, alloc1592, alloc1593, alloc1594) + R.vm.kill_object(alloc1591) + R.vm.kill_object(alloc1592) + R.vm.kill_object(alloc1593) + alloc1595: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1593: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), alloc1594, alloc1595) + R.vm.kill_object(alloc1594) + lv374: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1595, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 
1280), dtype="float16"),)) + R.vm.kill_object(alloc1595) + model_decoder_layers_30_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] + model_decoder_layers_30_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1215] + alloc1596: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv374, model_decoder_layers_30_self_attn_out_proj_weight5, model_decoder_layers_30_self_attn_out_proj_bias5, alloc1589, alloc1596) + R.vm.kill_object(alloc1589) + R.vm.kill_object(lv374) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias5) + model_decoder_layers_30_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1225] + model_decoder_layers_30_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1226] + alloc1597: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1596, model_decoder_layers_30_encoder_attn_layer_norm_weight5, model_decoder_layers_30_encoder_attn_layer_norm_bias5, alloc1597) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias5) + model_decoder_layers_30_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] + model_decoder_layers_30_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1222] + alloc1598: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1597, model_decoder_layers_30_encoder_attn_q_proj_weight5, model_decoder_layers_30_encoder_attn_q_proj_bias5, alloc1598) + 
R.vm.kill_object(alloc1597) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias5) + lv377: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1598, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1598) + alloc1599: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1597: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), lv377, alloc1599) + R.vm.kill_object(lv377) + lv378: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1599, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1599) + model_decoder_layers_30_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] + model_decoder_layers_30_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1224] + alloc1600: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7_add6(lv378, model_decoder_layers_30_encoder_attn_out_proj_weight5, model_decoder_layers_30_encoder_attn_out_proj_bias5, alloc1596, alloc1600) + R.vm.kill_object(alloc1596) + R.vm.kill_object(lv378) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias5) + model_decoder_layers_30_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1231] + model_decoder_layers_30_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1232] + alloc1601: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), 
R.dtype("float16")) + cls.layer_norm3(alloc1600, model_decoder_layers_30_final_layer_norm_weight5, model_decoder_layers_30_final_layer_norm_bias5, alloc1601) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias5) + model_decoder_layers_30_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] + model_decoder_layers_30_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1228] + alloc1602: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + cls.fused_NT_matmul1_add8_gelu2(alloc1601, model_decoder_layers_30_fc1_weight5, model_decoder_layers_30_fc1_bias5, alloc1602) + R.vm.kill_object(alloc1601) + R.vm.kill_object(model_decoder_layers_30_fc1_weight5) + R.vm.kill_object(model_decoder_layers_30_fc1_bias5) + model_decoder_layers_30_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] + model_decoder_layers_30_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1230] + alloc1603: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul2_add7_add6(alloc1602, model_decoder_layers_30_fc2_weight5, model_decoder_layers_30_fc2_bias5, alloc1600, alloc1603) + R.vm.kill_object(alloc1600) + R.vm.kill_object(alloc1602) + R.vm.kill_object(model_decoder_layers_30_fc2_weight5) + R.vm.kill_object(model_decoder_layers_30_fc2_bias5) + model_decoder_layers_31_self_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1240] + model_decoder_layers_31_self_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1241] + alloc1604: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1603, model_decoder_layers_31_self_attn_layer_norm_weight5, 
model_decoder_layers_31_self_attn_layer_norm_bias5, alloc1604) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias5) + model_decoder_layers_31_self_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] + model_decoder_layers_31_self_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1237] + alloc1605: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_q_proj_weight5, model_decoder_layers_31_self_attn_q_proj_bias5, alloc1605) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias5) + model_decoder_layers_31_self_attn_k_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] + alloc1606: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.NT_matmul(alloc1604, model_decoder_layers_31_self_attn_k_proj_weight5, alloc1606) + R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight5) + model_decoder_layers_31_self_attn_v_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] + model_decoder_layers_31_self_attn_v_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1235] + alloc1607: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1604, model_decoder_layers_31_self_attn_v_proj_weight5, model_decoder_layers_31_self_attn_v_proj_bias5, alloc1607) + R.vm.kill_object(alloc1604) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight5) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias5) + alloc1608: R.Tensor((1, 60, 64), dtype="float16") 
= R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 60, 64]), R.dtype("float16")) + cls.fused_reshape21_reshape21_reshape21_concatenate2_reshape22(alloc1605, alloc1606, alloc1607, alloc1608) + R.vm.kill_object(alloc1605) + R.vm.kill_object(alloc1606) + R.vm.kill_object(alloc1607) + alloc1609: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1607: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), alloc1608, alloc1609) + R.vm.kill_object(alloc1608) + lv385: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1609, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1609) + model_decoder_layers_31_self_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] + model_decoder_layers_31_self_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1239] + alloc1610: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage22, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + R.vm.kill_object(storage22) + cls.fused_NT_matmul_add7_add6(lv385, model_decoder_layers_31_self_attn_out_proj_weight5, model_decoder_layers_31_self_attn_out_proj_bias5, alloc1603, alloc1610) + R.vm.kill_object(alloc1603) + R.vm.kill_object(lv385) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias5) + model_decoder_layers_31_encoder_attn_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1249] + model_decoder_layers_31_encoder_attn_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1250] + alloc1611: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + 
cls.layer_norm3(alloc1610, model_decoder_layers_31_encoder_attn_layer_norm_weight5, model_decoder_layers_31_encoder_attn_layer_norm_bias5, alloc1611) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias5) + model_decoder_layers_31_encoder_attn_q_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] + model_decoder_layers_31_encoder_attn_q_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1246] + alloc1612: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.fused_NT_matmul_add7(alloc1611, model_decoder_layers_31_encoder_attn_q_proj_weight5, model_decoder_layers_31_encoder_attn_q_proj_bias5, alloc1612) + R.vm.kill_object(alloc1611) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight5) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias5) + lv388: R.Tensor((1, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1612, R.shape([1, 20, 64]), sinfo_args=(R.Tensor((1, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1612) + alloc1613: R.Tensor((1, 20, 64), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 20, 64]), R.dtype("float16")) + _1611: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), lv388, alloc1613) + R.vm.kill_object(lv388) + lv389: R.Tensor((1, 1, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1613, R.shape([1, 1, 1280]), sinfo_args=(R.Tensor((1, 1, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1613) + model_decoder_layers_31_encoder_attn_out_proj_weight5: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] + model_decoder_layers_31_encoder_attn_out_proj_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1248] + alloc1614: R.Tensor((1, 1, 
1280), dtype="float16") = R.vm.alloc_tensor(storage20, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + R.vm.kill_object(storage20) + cls.fused_NT_matmul_add7_add6(lv389, model_decoder_layers_31_encoder_attn_out_proj_weight5, model_decoder_layers_31_encoder_attn_out_proj_bias5, alloc1610, alloc1614) + R.vm.kill_object(alloc1610) + R.vm.kill_object(lv389) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight5) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias5) + model_decoder_layers_31_final_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1255] + model_decoder_layers_31_final_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1256] + alloc1615: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + cls.layer_norm3(alloc1614, model_decoder_layers_31_final_layer_norm_weight5, model_decoder_layers_31_final_layer_norm_bias5, alloc1615) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight5) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias5) + model_decoder_layers_31_fc1_weight5: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] + model_decoder_layers_31_fc1_bias5: R.Tensor((5120,), dtype="float16") = packed_params[1252] + alloc1616: R.Tensor((1, 1, 5120), dtype="float16") = R.vm.alloc_tensor(storage19, R.prim_value(0), R.shape([1, 1, 5120]), R.dtype("float16")) + R.vm.kill_object(storage19) + cls.fused_NT_matmul1_add8_gelu2(alloc1615, model_decoder_layers_31_fc1_weight5, model_decoder_layers_31_fc1_bias5, alloc1616) + R.vm.kill_object(alloc1615) + R.vm.kill_object(model_decoder_layers_31_fc1_weight5) + R.vm.kill_object(model_decoder_layers_31_fc1_bias5) + model_decoder_layers_31_fc2_weight5: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] + model_decoder_layers_31_fc2_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1254] + alloc1617: 
R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage21, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + R.vm.kill_object(storage21) + cls.fused_NT_matmul2_add7_add6(alloc1616, model_decoder_layers_31_fc2_weight5, model_decoder_layers_31_fc2_bias5, alloc1614, alloc1617) + R.vm.kill_object(alloc1614) + R.vm.kill_object(alloc1616) + R.vm.kill_object(model_decoder_layers_31_fc2_weight5) + R.vm.kill_object(model_decoder_layers_31_fc2_bias5) + model_decoder_layer_norm_weight5: R.Tensor((1280,), dtype="float16") = packed_params[1257] + model_decoder_layer_norm_bias5: R.Tensor((1280,), dtype="float16") = packed_params[1258] + alloc1618: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage23, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + R.vm.kill_object(storage23) + cls.layer_norm3(alloc1617, model_decoder_layer_norm_weight5, model_decoder_layer_norm_bias5, alloc1618) + R.vm.kill_object(alloc1617) + R.vm.kill_object(model_decoder_layer_norm_weight5) + R.vm.kill_object(model_decoder_layer_norm_bias5) + storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc1619: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32")) + R.vm.kill_object(storage) + cls.NT_matmul3(alloc1618, model_decoder_embed_tokens_weight5, alloc1619) + R.vm.kill_object(model_decoder_embed_tokens_weight5) + R.vm.kill_object(alloc1618) + return alloc1619
+
+    @R.function  # multinomial_from_uniform: checks the three tensor arguments, reshapes uniform_samples and sample_indices to (num_samples, 1), calls cls.parallel_sampling_from_prob with an int32 buffer as last argument, and returns that buffer reshaped to (num_samples,).
+    def multinomial_from_uniform(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32")) -> R.Tensor(("num_samples",), dtype="int32"):  # uniform_samples and sample_indices share the same symbolic length num_samples; the result has that length too.
+        num_samples = T.int64()  # symbolic int64 dimension variables referenced by the annotations in this function
+        batch_size = T.int64()
+        vocab_size = T.int64()
+        R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}})  # NOTE(review): "num_positions" is bounded here although this function declares no such variable; presumably the bound table is shared across the module (confirm).
+        cls = Module
+        shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),))  # 1-D int64 heap requested with size 3; the calls below address it by slot index 0, 1 and 2.
+        R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))  # passes 2 and float32, matching probs' annotated rank and dtype; presumably a runtime ndim/dtype check (confirm against the VM builtin).
+        R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))  # same check for uniform_samples with 1 and float32.
+        R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))  # same check for sample_indices with 1 and int32.
+        R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))  # NOTE(review): after ndim=2 come (code, value) pairs (1, 0) and (1, 1); presumably code 1 stores the dim into that heap slot, i.e. batch_size -> slot 0, vocab_size -> slot 1 (confirm against TVM's MatchShapeCode).
+        R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[1], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,))  # pair (1, 2): presumably stores num_samples into heap slot 2 (confirm).
+        R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=multinomial_from_uniform, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,))  # pair (3, 2) uses a different code than the store above; presumably asserts this dim equals heap slot 2, enforcing the shared num_samples (confirm).
+        gv6: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))  # 2-D shape from pairs (1, 2) and (0, 1); presumably "load heap slot 2" then "immediate 1", which agrees with the (num_samples, 1) annotation on the next line.
+        uniform_samples_1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv6, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),))
+        gv7: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))  # same arguments as gv6; a separate shape value is built for each use.
+        sample_indices_1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv7, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),))
+        storage3: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global"))  # 32 uint8 elements in "global" scope; 32 = 8 * 4, i.e. presumably sized for the num_samples upper bound (8) of 4-byte int32 values (confirm).
+        gv8: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),))  # same arguments as gv6 and gv7.
+        alloc3: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage3, R.prim_value(0), gv8, R.dtype("int32"))  # int32 tensor of shape gv8 placed at offset 0 of storage3.
+        R.vm.kill_object(storage3)  # the storage3 name is not referenced again; alloc3, created from it, is still used below.
+        cls.parallel_sampling_from_prob(probs, uniform_samples_1, sample_indices_1, alloc3)  # alloc3 is passed last and is what gets reshaped and returned, so it presumably receives the result (destination-passing call; confirm in parallel_sampling_from_prob).
+        R.vm.kill_object(uniform_samples_1)
+        R.vm.kill_object(sample_indices_1)
+        gv9: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),))  # 1-D shape from pair (1, 2); agrees with the (num_samples,) annotation on the next line.
+        gv: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc3, gv9, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),))
+        R.vm.kill_object(alloc3)  # the alloc3 name is dropped; the value returned is gv, the reshape result.
+        return gv
+
+ @R.function + def prefill(input_ids: R.Tensor((1, "seq_len"), dtype="int32"), paged_kv_cache: R.Object, packed_params: R.Tuple(R.Tensor((1280, 128, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280, 3), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1500, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"),
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((51866, 1280), dtype="float16"), R.Tensor((448, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), 
dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), 
dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), 
R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), 
R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280, 1280), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((5120, 1280), dtype="float16"), R.Tensor((5120,), dtype="float16"), R.Tensor((1280, 5120), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"), R.Tensor((1280,), dtype="float16"))) -> R.Tensor((1, 1, 51866), dtype="float32"): + seq_len = T.int64() + R.func_attr({"num_input": 2, "relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = 
Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(2),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", input_ids, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tuple_info", packed_params, R.prim_value(1259), R.str("ErrorContext(fn=prefill, loc=param[2], param=packed_params, annotation=R.Tuple(R.Tensor((1280, 128, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280, 3), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1500, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), 
dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((51866, 1280), dtype=\"float16\"), R.Tensor((448, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), 
R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), 
dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 
1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), 
R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), 
dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280, 1280), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((5120, 1280), dtype=\"float16\"), R.Tensor((5120,), dtype=\"float16\"), R.Tensor((1280, 5120), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"), R.Tensor((1280,), dtype=\"float16\"))) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", input_ids, shape_heap, R.prim_value(2), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.str("ErrorContext(fn=prefill, loc=param[0], param=input_ids, annotation=R.Tensor((1, seq_len), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + model_decoder_embed_tokens_weight4: R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + gv2580: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) + reshape1030: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.reshape", input_ids, gv2580, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) + model_decoder_embed_tokens_weight4_1: 
R.Tensor((51866, 1280), dtype="float16") = packed_params[487] + storage37: R.Object = R.vm.alloc_storage(R.shape([153600000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2581: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) + alloc1982: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2581, R.dtype("float16")) + cls.take(model_decoder_embed_tokens_weight4_1, reshape1030, alloc1982) + R.vm.kill_object(reshape1030) + R.vm.kill_object(model_decoder_embed_tokens_weight4_1) + gv2582: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1031: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1982, gv2582, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1982) + lv198: R.Tensor((seq_len,), dtype="int32") = R.call_packed("vm.builtin.attention_kv_cache_get_query_positions", paged_kv_cache, sinfo_args=(R.Tensor((seq_len,), dtype="int32"),)) + model_decoder_embed_positions_weight4: R.Tensor((448, 1280), dtype="float16") = packed_params[488] + storage38: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2583: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=2),)) + alloc1983: R.Tensor(dtype="float16", ndim=2) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2583, R.dtype("float16")) + cls.take1(model_decoder_embed_positions_weight4, lv198, alloc1983) + R.vm.kill_object(lv198) + R.vm.kill_object(model_decoder_embed_positions_weight4) + gv2584: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1032: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1983, gv2584, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(alloc1983) + storage39: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2585: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1984: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2585, R.dtype("float16")) + cls.add5(reshape1031, reshape1032, alloc1984) + R.vm.kill_object(reshape1031) + R.vm.kill_object(reshape1032) + model_decoder_layers_0_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[496] + model_decoder_layers_0_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[497] + gv2586: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1985: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2586, R.dtype("float16")) + cls.layer_norm2(alloc1984, model_decoder_layers_0_self_attn_layer_norm_weight4, model_decoder_layers_0_self_attn_layer_norm_bias4, alloc1985) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_0_self_attn_layer_norm_bias4) + model_decoder_layers_0_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[492] + 
model_decoder_layers_0_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[493] + gv2587: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1986: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2587, R.dtype("float16")) + _1985: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_q_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_q_proj_bias4, alloc1986) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_0_self_attn_q_proj_bias4) + gv2588: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1033: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1986, gv2588, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1986) + model_decoder_layers_0_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[489] + storage40: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2589: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1987: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2589, R.dtype("float16")) + _1986: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_0_self_attn_k_proj_weight4, 
alloc1985, alloc1987) + R.vm.kill_object(model_decoder_layers_0_self_attn_k_proj_weight4) + gv2590: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1034: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1987, gv2590, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1987) + model_decoder_layers_0_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[490] + model_decoder_layers_0_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[491] + storage41: R.Object = R.vm.alloc_storage(R.shape([115200000]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2591: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1988: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2591, R.dtype("float16")) + _1987: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_v_proj_weight4, alloc1985, model_decoder_layers_0_self_attn_v_proj_bias4, alloc1988) + R.vm.kill_object(alloc1985) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_0_self_attn_v_proj_bias4) + gv2592: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1035: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", 
alloc1988, gv2592, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1988) + gv2593: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc1989: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2593, R.dtype("float16")) + cls.concatenate1(reshape1033, reshape1034, reshape1035, alloc1989) + R.vm.kill_object(reshape1033) + R.vm.kill_object(reshape1034) + R.vm.kill_object(reshape1035) + gv2594: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1036: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1989, gv2594, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc1989) + gv2595: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1990: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2595, R.dtype("float16")) + _1989: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1036, alloc1990) + R.vm.kill_object(reshape1036) + gv2596: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1037: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc1990, gv2596, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1990) + gv2597: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1038: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1037, gv2597, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1037) + model_decoder_layers_0_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[494] + model_decoder_layers_0_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[495] + gv2598: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1991: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2598, R.dtype("float16")) + _1990: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_self_attn_out_proj_weight4, reshape1038, model_decoder_layers_0_self_attn_out_proj_bias4, alloc1991) + R.vm.kill_object(reshape1038) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_0_self_attn_out_proj_bias4) + gv2599: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1992: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2599, R.dtype("float16")) + cls.add5(alloc1984, alloc1991, alloc1992) + R.vm.kill_object(alloc1984) + 
R.vm.kill_object(alloc1991) + model_decoder_layers_0_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[505] + model_decoder_layers_0_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[506] + gv2600: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1993: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2600, R.dtype("float16")) + cls.layer_norm2(alloc1992, model_decoder_layers_0_encoder_attn_layer_norm_weight4, model_decoder_layers_0_encoder_attn_layer_norm_bias4, alloc1993) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_layer_norm_bias4) + model_decoder_layers_0_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[501] + model_decoder_layers_0_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[502] + gv2601: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1994: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2601, R.dtype("float16")) + _1993: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_q_proj_weight4, alloc1993, model_decoder_layers_0_encoder_attn_q_proj_bias4, alloc1994) + R.vm.kill_object(alloc1993) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_q_proj_bias4) + gv2602: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1039: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1994, gv2602, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1994) + gv2603: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1040: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1039, gv2603, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1039) + gv2604: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc1995: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2604, R.dtype("float16")) + _1994: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(0), R.prim_value(T.float32(1)), reshape1040, alloc1995) + R.vm.kill_object(reshape1040) + gv2605: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1041: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc1995, gv2605, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc1995) + gv2606: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1042: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1041, gv2606, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1041) + model_decoder_layers_0_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[503] + model_decoder_layers_0_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[504] + gv2607: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1996: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2607, R.dtype("float16")) + _1995: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_0_encoder_attn_out_proj_weight4, reshape1042, model_decoder_layers_0_encoder_attn_out_proj_bias4, alloc1996) + R.vm.kill_object(reshape1042) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_0_encoder_attn_out_proj_bias4) + gv2608: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1997: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2608, R.dtype("float16")) + cls.add5(alloc1992, alloc1996, alloc1997) + R.vm.kill_object(alloc1992) + R.vm.kill_object(alloc1996) + model_decoder_layers_0_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[511] + model_decoder_layers_0_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[512] + gv2609: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc1998: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2609, R.dtype("float16")) + cls.layer_norm2(alloc1997, model_decoder_layers_0_final_layer_norm_weight4, model_decoder_layers_0_final_layer_norm_bias4, alloc1998) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_0_final_layer_norm_bias4) + model_decoder_layers_0_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[507] + model_decoder_layers_0_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[508] + gv2610: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc1999: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2610, R.dtype("float16")) + _1998: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_0_fc1_weight4, alloc1998, model_decoder_layers_0_fc1_bias4, alloc1999) + R.vm.kill_object(alloc1998) + R.vm.kill_object(model_decoder_layers_0_fc1_weight4) + R.vm.kill_object(model_decoder_layers_0_fc1_bias4) + model_decoder_layers_0_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[509] + model_decoder_layers_0_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[510] + gv2611: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2000: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), 
gv2611, R.dtype("float16")) + _1999: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_0_fc2_weight4, alloc1999, model_decoder_layers_0_fc2_bias4, alloc2000) + R.vm.kill_object(alloc1999) + R.vm.kill_object(model_decoder_layers_0_fc2_weight4) + R.vm.kill_object(model_decoder_layers_0_fc2_bias4) + gv2612: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2001: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2612, R.dtype("float16")) + cls.add5(alloc1997, alloc2000, alloc2001) + R.vm.kill_object(alloc1997) + R.vm.kill_object(alloc2000) + model_decoder_layers_1_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[520] + model_decoder_layers_1_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[521] + gv2613: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2002: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2613, R.dtype("float16")) + cls.layer_norm2(alloc2001, model_decoder_layers_1_self_attn_layer_norm_weight4, model_decoder_layers_1_self_attn_layer_norm_bias4, alloc2002) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_1_self_attn_layer_norm_bias4) + model_decoder_layers_1_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[516] + model_decoder_layers_1_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[517] + gv2614: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2003: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2614, R.dtype("float16")) + _2002: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_q_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_q_proj_bias4, alloc2003) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_1_self_attn_q_proj_bias4) + gv2615: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1043: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2003, gv2615, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2003) + model_decoder_layers_1_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[513] + gv2616: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2004: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2616, R.dtype("float16")) + _2003: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_1_self_attn_k_proj_weight4, alloc2002, alloc2004) + R.vm.kill_object(model_decoder_layers_1_self_attn_k_proj_weight4) + gv2617: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1044: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2004, gv2617, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2004) + model_decoder_layers_1_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[514] + model_decoder_layers_1_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[515] + gv2618: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2005: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2618, R.dtype("float16")) + _2004: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_v_proj_weight4, alloc2002, model_decoder_layers_1_self_attn_v_proj_bias4, alloc2005) + R.vm.kill_object(alloc2002) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_1_self_attn_v_proj_bias4) + gv2619: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1045: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2005, gv2619, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2005) + gv2620: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2006: R.Tensor(dtype="float16", ndim=4) 
= R.vm.alloc_tensor(storage40, R.prim_value(0), gv2620, R.dtype("float16")) + cls.concatenate1(reshape1043, reshape1044, reshape1045, alloc2006) + R.vm.kill_object(reshape1043) + R.vm.kill_object(reshape1044) + R.vm.kill_object(reshape1045) + gv2621: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1046: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2006, gv2621, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2006) + gv2622: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2007: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2622, R.dtype("float16")) + _2006: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1046, alloc2007) + R.vm.kill_object(reshape1046) + gv2623: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1047: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2007, gv2623, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2007) + gv2624: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1048: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1047, gv2624, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1047) + model_decoder_layers_1_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[518] + model_decoder_layers_1_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[519] + gv2625: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2008: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2625, R.dtype("float16")) + _2007: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_self_attn_out_proj_weight4, reshape1048, model_decoder_layers_1_self_attn_out_proj_bias4, alloc2008) + R.vm.kill_object(reshape1048) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_1_self_attn_out_proj_bias4) + gv2626: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2009: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2626, R.dtype("float16")) + cls.add5(alloc2001, alloc2008, alloc2009) + R.vm.kill_object(alloc2001) + R.vm.kill_object(alloc2008) + model_decoder_layers_1_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[529] + model_decoder_layers_1_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[530] + gv2627: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2010: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2627, R.dtype("float16")) + cls.layer_norm2(alloc2009, model_decoder_layers_1_encoder_attn_layer_norm_weight4, model_decoder_layers_1_encoder_attn_layer_norm_bias4, alloc2010) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_layer_norm_bias4) + model_decoder_layers_1_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[525] + model_decoder_layers_1_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[526] + gv2628: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2011: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2628, R.dtype("float16")) + _2010: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_q_proj_weight4, alloc2010, model_decoder_layers_1_encoder_attn_q_proj_bias4, alloc2011) + R.vm.kill_object(alloc2010) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_q_proj_bias4) + gv2629: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1049: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2011, gv2629, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2011) + gv2630: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1050: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1049, gv2630, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1049) + gv2631: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2012: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2631, R.dtype("float16")) + _2011: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(1), R.prim_value(T.float32(1)), reshape1050, alloc2012) + R.vm.kill_object(reshape1050) + gv2632: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1051: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2012, gv2632, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2012) + gv2633: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1052: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1051, gv2633, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1051) + model_decoder_layers_1_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[527] + 
model_decoder_layers_1_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[528] + gv2634: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2013: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2634, R.dtype("float16")) + _2012: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_1_encoder_attn_out_proj_weight4, reshape1052, model_decoder_layers_1_encoder_attn_out_proj_bias4, alloc2013) + R.vm.kill_object(reshape1052) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_1_encoder_attn_out_proj_bias4) + gv2635: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2014: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2635, R.dtype("float16")) + cls.add5(alloc2009, alloc2013, alloc2014) + R.vm.kill_object(alloc2009) + R.vm.kill_object(alloc2013) + model_decoder_layers_1_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[535] + model_decoder_layers_1_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[536] + gv2636: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2015: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2636, R.dtype("float16")) + cls.layer_norm2(alloc2014, model_decoder_layers_1_final_layer_norm_weight4, 
model_decoder_layers_1_final_layer_norm_bias4, alloc2015) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_1_final_layer_norm_bias4) + model_decoder_layers_1_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[531] + model_decoder_layers_1_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[532] + gv2637: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2016: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2637, R.dtype("float16")) + _2015: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_1_fc1_weight4, alloc2015, model_decoder_layers_1_fc1_bias4, alloc2016) + R.vm.kill_object(alloc2015) + R.vm.kill_object(model_decoder_layers_1_fc1_weight4) + R.vm.kill_object(model_decoder_layers_1_fc1_bias4) + model_decoder_layers_1_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[533] + model_decoder_layers_1_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[534] + gv2638: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2017: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2638, R.dtype("float16")) + _2016: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_1_fc2_weight4, alloc2016, model_decoder_layers_1_fc2_bias4, alloc2017) + R.vm.kill_object(alloc2016) + R.vm.kill_object(model_decoder_layers_1_fc2_weight4) + R.vm.kill_object(model_decoder_layers_1_fc2_bias4) + gv2639: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2018: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2639, R.dtype("float16")) + cls.add5(alloc2014, alloc2017, alloc2018) + R.vm.kill_object(alloc2014) + R.vm.kill_object(alloc2017) + model_decoder_layers_2_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[544] + model_decoder_layers_2_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[545] + gv2640: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2019: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2640, R.dtype("float16")) + cls.layer_norm2(alloc2018, model_decoder_layers_2_self_attn_layer_norm_weight4, model_decoder_layers_2_self_attn_layer_norm_bias4, alloc2019) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_2_self_attn_layer_norm_bias4) + model_decoder_layers_2_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[540] + model_decoder_layers_2_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[541] + gv2641: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2020: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2641, R.dtype("float16")) + _2019: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_2_self_attn_q_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_q_proj_bias4, alloc2020) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_2_self_attn_q_proj_bias4) + gv2642: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1053: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2020, gv2642, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2020) + model_decoder_layers_2_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[537] + gv2643: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2021: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2643, R.dtype("float16")) + _2020: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_2_self_attn_k_proj_weight4, alloc2019, alloc2021) + R.vm.kill_object(model_decoder_layers_2_self_attn_k_proj_weight4) + gv2644: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1054: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2021, gv2644, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2021) + model_decoder_layers_2_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[538] + model_decoder_layers_2_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[539] + gv2645: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2022: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2645, R.dtype("float16")) + _2021: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_v_proj_weight4, alloc2019, model_decoder_layers_2_self_attn_v_proj_bias4, alloc2022) + R.vm.kill_object(alloc2019) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_2_self_attn_v_proj_bias4) + gv2646: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1055: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2022, gv2646, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2022) + gv2647: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2023: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2647, R.dtype("float16")) + cls.concatenate1(reshape1053, reshape1054, reshape1055, alloc2023) + R.vm.kill_object(reshape1053) + R.vm.kill_object(reshape1054) + R.vm.kill_object(reshape1055) + gv2648: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1056: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2023, gv2648, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2023) + gv2649: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2024: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2649, R.dtype("float16")) + _2023: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1056, alloc2024) + R.vm.kill_object(reshape1056) + gv2650: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1057: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2024, gv2650, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2024) + gv2651: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1058: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1057, gv2651, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1057) + model_decoder_layers_2_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[542] + model_decoder_layers_2_self_attn_out_proj_bias4: R.Tensor((1280,), 
dtype="float16") = packed_params[543] + gv2652: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2025: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2652, R.dtype("float16")) + _2024: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_self_attn_out_proj_weight4, reshape1058, model_decoder_layers_2_self_attn_out_proj_bias4, alloc2025) + R.vm.kill_object(reshape1058) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_2_self_attn_out_proj_bias4) + gv2653: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2026: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2653, R.dtype("float16")) + cls.add5(alloc2018, alloc2025, alloc2026) + R.vm.kill_object(alloc2018) + R.vm.kill_object(alloc2025) + model_decoder_layers_2_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[553] + model_decoder_layers_2_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[554] + gv2654: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2027: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2654, R.dtype("float16")) + cls.layer_norm2(alloc2026, model_decoder_layers_2_encoder_attn_layer_norm_weight4, model_decoder_layers_2_encoder_attn_layer_norm_bias4, alloc2027) + 
R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_layer_norm_bias4) + model_decoder_layers_2_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[549] + model_decoder_layers_2_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[550] + gv2655: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2028: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2655, R.dtype("float16")) + _2027: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_q_proj_weight4, alloc2027, model_decoder_layers_2_encoder_attn_q_proj_bias4, alloc2028) + R.vm.kill_object(alloc2027) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_q_proj_bias4) + gv2656: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1059: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2028, gv2656, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2028) + gv2657: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1060: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1059, gv2657, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) 
+ R.vm.kill_object(reshape1059) + gv2658: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2029: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2658, R.dtype("float16")) + _2028: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(2), R.prim_value(T.float32(1)), reshape1060, alloc2029) + R.vm.kill_object(reshape1060) + gv2659: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1061: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2029, gv2659, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2029) + gv2660: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1062: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1061, gv2660, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1061) + model_decoder_layers_2_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[551] + model_decoder_layers_2_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[552] + gv2661: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2030: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2661, R.dtype("float16")) + _2029: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_2_encoder_attn_out_proj_weight4, reshape1062, model_decoder_layers_2_encoder_attn_out_proj_bias4, alloc2030) + R.vm.kill_object(reshape1062) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_2_encoder_attn_out_proj_bias4) + gv2662: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2031: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2662, R.dtype("float16")) + cls.add5(alloc2026, alloc2030, alloc2031) + R.vm.kill_object(alloc2026) + R.vm.kill_object(alloc2030) + model_decoder_layers_2_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[559] + model_decoder_layers_2_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[560] + gv2663: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2032: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2663, R.dtype("float16")) + cls.layer_norm2(alloc2031, model_decoder_layers_2_final_layer_norm_weight4, model_decoder_layers_2_final_layer_norm_bias4, alloc2032) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_2_final_layer_norm_bias4) + model_decoder_layers_2_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[555] + model_decoder_layers_2_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[556] + 
gv2664: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2033: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2664, R.dtype("float16")) + _2032: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_2_fc1_weight4, alloc2032, model_decoder_layers_2_fc1_bias4, alloc2033) + R.vm.kill_object(alloc2032) + R.vm.kill_object(model_decoder_layers_2_fc1_weight4) + R.vm.kill_object(model_decoder_layers_2_fc1_bias4) + model_decoder_layers_2_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[557] + model_decoder_layers_2_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[558] + gv2665: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2034: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2665, R.dtype("float16")) + _2033: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_2_fc2_weight4, alloc2033, model_decoder_layers_2_fc2_bias4, alloc2034) + R.vm.kill_object(alloc2033) + R.vm.kill_object(model_decoder_layers_2_fc2_weight4) + R.vm.kill_object(model_decoder_layers_2_fc2_bias4) + gv2666: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2035: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2666, R.dtype("float16")) + cls.add5(alloc2031, alloc2034, alloc2035) + R.vm.kill_object(alloc2031) + 
R.vm.kill_object(alloc2034) + model_decoder_layers_3_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[568] + model_decoder_layers_3_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[569] + gv2667: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2036: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2667, R.dtype("float16")) + cls.layer_norm2(alloc2035, model_decoder_layers_3_self_attn_layer_norm_weight4, model_decoder_layers_3_self_attn_layer_norm_bias4, alloc2036) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_3_self_attn_layer_norm_bias4) + model_decoder_layers_3_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[564] + model_decoder_layers_3_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[565] + gv2668: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2037: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2668, R.dtype("float16")) + _2036: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_q_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_q_proj_bias4, alloc2037) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_3_self_attn_q_proj_bias4) + gv2669: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1063: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2037, gv2669, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2037) + model_decoder_layers_3_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[561] + gv2670: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2038: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2670, R.dtype("float16")) + _2037: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_3_self_attn_k_proj_weight4, alloc2036, alloc2038) + R.vm.kill_object(model_decoder_layers_3_self_attn_k_proj_weight4) + gv2671: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1064: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2038, gv2671, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2038) + model_decoder_layers_3_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[562] + model_decoder_layers_3_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[563] + gv2672: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2039: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, 
R.prim_value(0), gv2672, R.dtype("float16")) + _2038: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_v_proj_weight4, alloc2036, model_decoder_layers_3_self_attn_v_proj_bias4, alloc2039) + R.vm.kill_object(alloc2036) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_3_self_attn_v_proj_bias4) + gv2673: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1065: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2039, gv2673, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2039) + gv2674: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2040: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2674, R.dtype("float16")) + cls.concatenate1(reshape1063, reshape1064, reshape1065, alloc2040) + R.vm.kill_object(reshape1063) + R.vm.kill_object(reshape1064) + R.vm.kill_object(reshape1065) + gv2675: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1066: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2040, gv2675, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2040) + gv2676: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2041: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2676, R.dtype("float16")) + _2040: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1066, alloc2041) + R.vm.kill_object(reshape1066) + gv2677: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1067: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2041, gv2677, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2041) + gv2678: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1068: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1067, gv2678, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1067) + model_decoder_layers_3_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[566] + model_decoder_layers_3_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[567] + gv2679: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2042: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2679, R.dtype("float16")) + _2041: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_self_attn_out_proj_weight4, reshape1068, model_decoder_layers_3_self_attn_out_proj_bias4, alloc2042) + R.vm.kill_object(reshape1068) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_3_self_attn_out_proj_bias4) + gv2680: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2043: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2680, R.dtype("float16")) + cls.add5(alloc2035, alloc2042, alloc2043) + R.vm.kill_object(alloc2035) + R.vm.kill_object(alloc2042) + model_decoder_layers_3_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[577] + model_decoder_layers_3_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[578] + gv2681: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2044: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2681, R.dtype("float16")) + cls.layer_norm2(alloc2043, model_decoder_layers_3_encoder_attn_layer_norm_weight4, model_decoder_layers_3_encoder_attn_layer_norm_bias4, alloc2044) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_layer_norm_bias4) + model_decoder_layers_3_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[573] + model_decoder_layers_3_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[574] + gv2682: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2045: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2682, R.dtype("float16")) + _2044: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_q_proj_weight4, alloc2044, model_decoder_layers_3_encoder_attn_q_proj_bias4, alloc2045) + R.vm.kill_object(alloc2044) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_q_proj_bias4) + gv2683: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1069: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2045, gv2683, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2045) + gv2684: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1070: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1069, gv2684, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1069) + gv2685: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2046: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2685, R.dtype("float16")) + _2045: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(3), R.prim_value(T.float32(1)), reshape1070, alloc2046) + R.vm.kill_object(reshape1070) + gv2686: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1071: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2046, gv2686, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2046) + gv2687: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1072: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1071, gv2687, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1071) + model_decoder_layers_3_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[575] + model_decoder_layers_3_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[576] + gv2688: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2047: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2688, R.dtype("float16")) + _2046: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_3_encoder_attn_out_proj_weight4, reshape1072, model_decoder_layers_3_encoder_attn_out_proj_bias4, alloc2047) + R.vm.kill_object(reshape1072) + 
R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_3_encoder_attn_out_proj_bias4) + gv2689: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2048: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2689, R.dtype("float16")) + cls.add5(alloc2043, alloc2047, alloc2048) + R.vm.kill_object(alloc2043) + R.vm.kill_object(alloc2047) + model_decoder_layers_3_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[583] + model_decoder_layers_3_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[584] + gv2690: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2049: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2690, R.dtype("float16")) + cls.layer_norm2(alloc2048, model_decoder_layers_3_final_layer_norm_weight4, model_decoder_layers_3_final_layer_norm_bias4, alloc2049) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_3_final_layer_norm_bias4) + model_decoder_layers_3_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[579] + model_decoder_layers_3_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[580] + gv2691: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2050: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2691, R.dtype("float16")) + _2049: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_3_fc1_weight4, alloc2049, model_decoder_layers_3_fc1_bias4, alloc2050) + R.vm.kill_object(alloc2049) + R.vm.kill_object(model_decoder_layers_3_fc1_weight4) + R.vm.kill_object(model_decoder_layers_3_fc1_bias4) + model_decoder_layers_3_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[581] + model_decoder_layers_3_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[582] + gv2692: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2051: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2692, R.dtype("float16")) + _2050: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_3_fc2_weight4, alloc2050, model_decoder_layers_3_fc2_bias4, alloc2051) + R.vm.kill_object(alloc2050) + R.vm.kill_object(model_decoder_layers_3_fc2_weight4) + R.vm.kill_object(model_decoder_layers_3_fc2_bias4) + gv2693: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2052: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2693, R.dtype("float16")) + cls.add5(alloc2048, alloc2051, alloc2052) + R.vm.kill_object(alloc2048) + R.vm.kill_object(alloc2051) + model_decoder_layers_4_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[592] + model_decoder_layers_4_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[593] + gv2694: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2053: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2694, R.dtype("float16")) + cls.layer_norm2(alloc2052, model_decoder_layers_4_self_attn_layer_norm_weight4, model_decoder_layers_4_self_attn_layer_norm_bias4, alloc2053) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_4_self_attn_layer_norm_bias4) + model_decoder_layers_4_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[588] + model_decoder_layers_4_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[589] + gv2695: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2054: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2695, R.dtype("float16")) + _2053: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_q_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_q_proj_bias4, alloc2054) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_4_self_attn_q_proj_bias4) + gv2696: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1073: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2054, gv2696, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2054) + model_decoder_layers_4_self_attn_k_proj_weight4: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[585] + gv2697: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2055: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2697, R.dtype("float16")) + _2054: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_4_self_attn_k_proj_weight4, alloc2053, alloc2055) + R.vm.kill_object(model_decoder_layers_4_self_attn_k_proj_weight4) + gv2698: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1074: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2055, gv2698, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2055) + model_decoder_layers_4_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[586] + model_decoder_layers_4_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[587] + gv2699: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2056: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2699, R.dtype("float16")) + _2055: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_v_proj_weight4, alloc2053, model_decoder_layers_4_self_attn_v_proj_bias4, alloc2056) + R.vm.kill_object(alloc2053) + R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_weight4) + 
R.vm.kill_object(model_decoder_layers_4_self_attn_v_proj_bias4) + gv2700: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1075: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2056, gv2700, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2056) + gv2701: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2057: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2701, R.dtype("float16")) + cls.concatenate1(reshape1073, reshape1074, reshape1075, alloc2057) + R.vm.kill_object(reshape1073) + R.vm.kill_object(reshape1074) + R.vm.kill_object(reshape1075) + gv2702: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1076: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2057, gv2702, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2057) + gv2703: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2058: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2703, R.dtype("float16")) + _2057: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, 
R.prim_value(4), R.prim_value(T.float32(1)), reshape1076, alloc2058) + R.vm.kill_object(reshape1076) + gv2704: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1077: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2058, gv2704, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2058) + gv2705: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1078: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1077, gv2705, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1077) + model_decoder_layers_4_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[590] + model_decoder_layers_4_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[591] + gv2706: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2059: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2706, R.dtype("float16")) + _2058: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_self_attn_out_proj_weight4, reshape1078, model_decoder_layers_4_self_attn_out_proj_bias4, alloc2059) + R.vm.kill_object(reshape1078) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_4_self_attn_out_proj_bias4) + gv2707: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2060: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2707, R.dtype("float16")) + cls.add5(alloc2052, alloc2059, alloc2060) + R.vm.kill_object(alloc2052) + R.vm.kill_object(alloc2059) + model_decoder_layers_4_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[601] + model_decoder_layers_4_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[602] + gv2708: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2061: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2708, R.dtype("float16")) + cls.layer_norm2(alloc2060, model_decoder_layers_4_encoder_attn_layer_norm_weight4, model_decoder_layers_4_encoder_attn_layer_norm_bias4, alloc2061) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_layer_norm_bias4) + model_decoder_layers_4_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[597] + model_decoder_layers_4_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[598] + gv2709: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2062: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2709, R.dtype("float16")) + _2061: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_4_encoder_attn_q_proj_weight4, alloc2061, model_decoder_layers_4_encoder_attn_q_proj_bias4, alloc2062) + R.vm.kill_object(alloc2061) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_q_proj_bias4) + gv2710: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1079: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2062, gv2710, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2062) + gv2711: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1080: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1079, gv2711, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1079) + gv2712: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2063: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2712, R.dtype("float16")) + _2062: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(4), R.prim_value(T.float32(1)), reshape1080, alloc2063) + R.vm.kill_object(reshape1080) + gv2713: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1081: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2063, gv2713, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2063) + gv2714: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1082: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1081, gv2714, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1081) + model_decoder_layers_4_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[599] + model_decoder_layers_4_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[600] + gv2715: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2064: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2715, R.dtype("float16")) + _2063: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_4_encoder_attn_out_proj_weight4, reshape1082, model_decoder_layers_4_encoder_attn_out_proj_bias4, alloc2064) + R.vm.kill_object(reshape1082) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_4_encoder_attn_out_proj_bias4) + gv2716: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2065: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv2716, R.dtype("float16")) + cls.add5(alloc2060, alloc2064, alloc2065) + R.vm.kill_object(alloc2060) + R.vm.kill_object(alloc2064) + model_decoder_layers_4_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[607] + model_decoder_layers_4_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[608] + gv2717: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2066: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2717, R.dtype("float16")) + cls.layer_norm2(alloc2065, model_decoder_layers_4_final_layer_norm_weight4, model_decoder_layers_4_final_layer_norm_bias4, alloc2066) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_4_final_layer_norm_bias4) + model_decoder_layers_4_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[603] + model_decoder_layers_4_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[604] + gv2718: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2067: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2718, R.dtype("float16")) + _2066: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_4_fc1_weight4, alloc2066, model_decoder_layers_4_fc1_bias4, alloc2067) + R.vm.kill_object(alloc2066) + R.vm.kill_object(model_decoder_layers_4_fc1_weight4) + R.vm.kill_object(model_decoder_layers_4_fc1_bias4) + model_decoder_layers_4_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[605] + 
model_decoder_layers_4_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[606] + gv2719: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2068: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2719, R.dtype("float16")) + _2067: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_4_fc2_weight4, alloc2067, model_decoder_layers_4_fc2_bias4, alloc2068) + R.vm.kill_object(alloc2067) + R.vm.kill_object(model_decoder_layers_4_fc2_weight4) + R.vm.kill_object(model_decoder_layers_4_fc2_bias4) + gv2720: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2069: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2720, R.dtype("float16")) + cls.add5(alloc2065, alloc2068, alloc2069) + R.vm.kill_object(alloc2065) + R.vm.kill_object(alloc2068) + model_decoder_layers_5_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[616] + model_decoder_layers_5_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[617] + gv2721: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2070: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2721, R.dtype("float16")) + cls.layer_norm2(alloc2069, model_decoder_layers_5_self_attn_layer_norm_weight4, model_decoder_layers_5_self_attn_layer_norm_bias4, alloc2070) + 
R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_5_self_attn_layer_norm_bias4) + model_decoder_layers_5_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[612] + model_decoder_layers_5_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[613] + gv2722: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2071: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2722, R.dtype("float16")) + _2070: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_q_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_q_proj_bias4, alloc2071) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_5_self_attn_q_proj_bias4) + gv2723: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1083: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2071, gv2723, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2071) + model_decoder_layers_5_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[609] + gv2724: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2072: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2724, 
R.dtype("float16")) + _2071: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_5_self_attn_k_proj_weight4, alloc2070, alloc2072) + R.vm.kill_object(model_decoder_layers_5_self_attn_k_proj_weight4) + gv2725: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1084: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2072, gv2725, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2072) + model_decoder_layers_5_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[610] + model_decoder_layers_5_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[611] + gv2726: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2073: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2726, R.dtype("float16")) + _2072: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_v_proj_weight4, alloc2070, model_decoder_layers_5_self_attn_v_proj_bias4, alloc2073) + R.vm.kill_object(alloc2070) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_5_self_attn_v_proj_bias4) + gv2727: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1085: R.Tensor((1, seq_len, 20, 64), dtype="float16") = 
R.call_packed("vm.builtin.reshape", alloc2073, gv2727, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2073) + gv2728: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2074: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2728, R.dtype("float16")) + cls.concatenate1(reshape1083, reshape1084, reshape1085, alloc2074) + R.vm.kill_object(reshape1083) + R.vm.kill_object(reshape1084) + R.vm.kill_object(reshape1085) + gv2729: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1086: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2074, gv2729, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2074) + gv2730: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2075: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2730, R.dtype("float16")) + _2074: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1086, alloc2075) + R.vm.kill_object(reshape1086) + gv2731: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1087: R.Tensor((1, 
seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2075, gv2731, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2075) + gv2732: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1088: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1087, gv2732, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1087) + model_decoder_layers_5_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[614] + model_decoder_layers_5_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[615] + gv2733: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2076: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2733, R.dtype("float16")) + _2075: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_self_attn_out_proj_weight4, reshape1088, model_decoder_layers_5_self_attn_out_proj_bias4, alloc2076) + R.vm.kill_object(reshape1088) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_5_self_attn_out_proj_bias4) + gv2734: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2077: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2734, R.dtype("float16")) + cls.add5(alloc2069, alloc2076, alloc2077) + 
R.vm.kill_object(alloc2069) + R.vm.kill_object(alloc2076) + model_decoder_layers_5_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[625] + model_decoder_layers_5_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[626] + gv2735: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2078: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2735, R.dtype("float16")) + cls.layer_norm2(alloc2077, model_decoder_layers_5_encoder_attn_layer_norm_weight4, model_decoder_layers_5_encoder_attn_layer_norm_bias4, alloc2078) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_layer_norm_bias4) + model_decoder_layers_5_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[621] + model_decoder_layers_5_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[622] + gv2736: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2079: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2736, R.dtype("float16")) + _2078: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_q_proj_weight4, alloc2078, model_decoder_layers_5_encoder_attn_q_proj_bias4, alloc2079) + R.vm.kill_object(alloc2078) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_q_proj_bias4) + gv2737: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1089: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2079, gv2737, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2079) + gv2738: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1090: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1089, gv2738, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1089) + gv2739: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2080: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2739, R.dtype("float16")) + _2079: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(5), R.prim_value(T.float32(1)), reshape1090, alloc2080) + R.vm.kill_object(reshape1090) + gv2740: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1091: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2080, gv2740, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2080) + gv2741: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1092: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1091, gv2741, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1091) + model_decoder_layers_5_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[623] + model_decoder_layers_5_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[624] + gv2742: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2081: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2742, R.dtype("float16")) + _2080: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_5_encoder_attn_out_proj_weight4, reshape1092, model_decoder_layers_5_encoder_attn_out_proj_bias4, alloc2081) + R.vm.kill_object(reshape1092) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_5_encoder_attn_out_proj_bias4) + gv2743: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2082: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2743, R.dtype("float16")) + cls.add5(alloc2077, alloc2081, alloc2082) + R.vm.kill_object(alloc2077) + R.vm.kill_object(alloc2081) + model_decoder_layers_5_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[631] + model_decoder_layers_5_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[632] + gv2744: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2083: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2744, R.dtype("float16")) + cls.layer_norm2(alloc2082, model_decoder_layers_5_final_layer_norm_weight4, model_decoder_layers_5_final_layer_norm_bias4, alloc2083) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_5_final_layer_norm_bias4) + model_decoder_layers_5_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[627] + model_decoder_layers_5_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[628] + gv2745: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2084: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2745, R.dtype("float16")) + _2083: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_5_fc1_weight4, alloc2083, model_decoder_layers_5_fc1_bias4, alloc2084) + R.vm.kill_object(alloc2083) + R.vm.kill_object(model_decoder_layers_5_fc1_weight4) + R.vm.kill_object(model_decoder_layers_5_fc1_bias4) + model_decoder_layers_5_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[629] + model_decoder_layers_5_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[630] + gv2746: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2085: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), 
gv2746, R.dtype("float16")) + _2084: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_5_fc2_weight4, alloc2084, model_decoder_layers_5_fc2_bias4, alloc2085) + R.vm.kill_object(alloc2084) + R.vm.kill_object(model_decoder_layers_5_fc2_weight4) + R.vm.kill_object(model_decoder_layers_5_fc2_bias4) + gv2747: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2086: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2747, R.dtype("float16")) + cls.add5(alloc2082, alloc2085, alloc2086) + R.vm.kill_object(alloc2082) + R.vm.kill_object(alloc2085) + model_decoder_layers_6_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[640] + model_decoder_layers_6_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[641] + gv2748: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2087: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2748, R.dtype("float16")) + cls.layer_norm2(alloc2086, model_decoder_layers_6_self_attn_layer_norm_weight4, model_decoder_layers_6_self_attn_layer_norm_bias4, alloc2087) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_6_self_attn_layer_norm_bias4) + model_decoder_layers_6_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[636] + model_decoder_layers_6_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[637] + gv2749: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2088: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2749, R.dtype("float16")) + _2087: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_q_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_q_proj_bias4, alloc2088) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_6_self_attn_q_proj_bias4) + gv2750: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1093: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2088, gv2750, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2088) + model_decoder_layers_6_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[633] + gv2751: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2089: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2751, R.dtype("float16")) + _2088: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_6_self_attn_k_proj_weight4, alloc2087, alloc2089) + R.vm.kill_object(model_decoder_layers_6_self_attn_k_proj_weight4) + gv2752: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1094: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2089, gv2752, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2089) + model_decoder_layers_6_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[634] + model_decoder_layers_6_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[635] + gv2753: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2090: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2753, R.dtype("float16")) + _2089: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_v_proj_weight4, alloc2087, model_decoder_layers_6_self_attn_v_proj_bias4, alloc2090) + R.vm.kill_object(alloc2087) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_6_self_attn_v_proj_bias4) + gv2754: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1095: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2090, gv2754, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2090) + gv2755: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2091: R.Tensor(dtype="float16", ndim=4) 
= R.vm.alloc_tensor(storage41, R.prim_value(0), gv2755, R.dtype("float16")) + cls.concatenate1(reshape1093, reshape1094, reshape1095, alloc2091) + R.vm.kill_object(reshape1093) + R.vm.kill_object(reshape1094) + R.vm.kill_object(reshape1095) + gv2756: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1096: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2091, gv2756, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2091) + gv2757: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2092: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2757, R.dtype("float16")) + _2091: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1096, alloc2092) + R.vm.kill_object(reshape1096) + gv2758: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1097: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2092, gv2758, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2092) + gv2759: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1098: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1097, gv2759, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1097) + model_decoder_layers_6_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[638] + model_decoder_layers_6_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[639] + gv2760: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2093: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2760, R.dtype("float16")) + _2092: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_self_attn_out_proj_weight4, reshape1098, model_decoder_layers_6_self_attn_out_proj_bias4, alloc2093) + R.vm.kill_object(reshape1098) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_6_self_attn_out_proj_bias4) + gv2761: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2094: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2761, R.dtype("float16")) + cls.add5(alloc2086, alloc2093, alloc2094) + R.vm.kill_object(alloc2086) + R.vm.kill_object(alloc2093) + model_decoder_layers_6_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[649] + model_decoder_layers_6_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[650] + gv2762: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2095: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2762, R.dtype("float16")) + cls.layer_norm2(alloc2094, model_decoder_layers_6_encoder_attn_layer_norm_weight4, model_decoder_layers_6_encoder_attn_layer_norm_bias4, alloc2095) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_layer_norm_bias4) + model_decoder_layers_6_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[645] + model_decoder_layers_6_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[646] + gv2763: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2096: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2763, R.dtype("float16")) + _2095: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_q_proj_weight4, alloc2095, model_decoder_layers_6_encoder_attn_q_proj_bias4, alloc2096) + R.vm.kill_object(alloc2095) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_q_proj_bias4) + gv2764: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1099: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2096, gv2764, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2096) + gv2765: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1100: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1099, gv2765, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1099) + gv2766: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2097: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2766, R.dtype("float16")) + _2096: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(6), R.prim_value(T.float32(1)), reshape1100, alloc2097) + R.vm.kill_object(reshape1100) + gv2767: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1101: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2097, gv2767, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2097) + gv2768: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1102: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1101, gv2768, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1101) + model_decoder_layers_6_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[647] + 
model_decoder_layers_6_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[648] + gv2769: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2098: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2769, R.dtype("float16")) + _2097: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_6_encoder_attn_out_proj_weight4, reshape1102, model_decoder_layers_6_encoder_attn_out_proj_bias4, alloc2098) + R.vm.kill_object(reshape1102) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_6_encoder_attn_out_proj_bias4) + gv2770: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2099: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2770, R.dtype("float16")) + cls.add5(alloc2094, alloc2098, alloc2099) + R.vm.kill_object(alloc2094) + R.vm.kill_object(alloc2098) + model_decoder_layers_6_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[655] + model_decoder_layers_6_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[656] + gv2771: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2100: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2771, R.dtype("float16")) + cls.layer_norm2(alloc2099, model_decoder_layers_6_final_layer_norm_weight4, 
model_decoder_layers_6_final_layer_norm_bias4, alloc2100) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_6_final_layer_norm_bias4) + model_decoder_layers_6_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[651] + model_decoder_layers_6_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[652] + gv2772: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2101: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2772, R.dtype("float16")) + _2100: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_6_fc1_weight4, alloc2100, model_decoder_layers_6_fc1_bias4, alloc2101) + R.vm.kill_object(alloc2100) + R.vm.kill_object(model_decoder_layers_6_fc1_weight4) + R.vm.kill_object(model_decoder_layers_6_fc1_bias4) + model_decoder_layers_6_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[653] + model_decoder_layers_6_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[654] + gv2773: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2102: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2773, R.dtype("float16")) + _2101: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_6_fc2_weight4, alloc2101, model_decoder_layers_6_fc2_bias4, alloc2102) + R.vm.kill_object(alloc2101) + R.vm.kill_object(model_decoder_layers_6_fc2_weight4) + R.vm.kill_object(model_decoder_layers_6_fc2_bias4) + gv2774: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2103: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2774, R.dtype("float16")) + cls.add5(alloc2099, alloc2102, alloc2103) + R.vm.kill_object(alloc2099) + R.vm.kill_object(alloc2102) + model_decoder_layers_7_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[664] + model_decoder_layers_7_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[665] + gv2775: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2104: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2775, R.dtype("float16")) + cls.layer_norm2(alloc2103, model_decoder_layers_7_self_attn_layer_norm_weight4, model_decoder_layers_7_self_attn_layer_norm_bias4, alloc2104) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_7_self_attn_layer_norm_bias4) + model_decoder_layers_7_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[660] + model_decoder_layers_7_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[661] + gv2776: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2105: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2776, R.dtype("float16")) + _2104: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_7_self_attn_q_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_q_proj_bias4, alloc2105) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_7_self_attn_q_proj_bias4) + gv2777: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1103: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2105, gv2777, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2105) + model_decoder_layers_7_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[657] + gv2778: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2106: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2778, R.dtype("float16")) + _2105: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_7_self_attn_k_proj_weight4, alloc2104, alloc2106) + R.vm.kill_object(model_decoder_layers_7_self_attn_k_proj_weight4) + gv2779: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1104: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2106, gv2779, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2106) + model_decoder_layers_7_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = 
packed_params[658] + model_decoder_layers_7_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[659] + gv2780: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2107: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2780, R.dtype("float16")) + _2106: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_v_proj_weight4, alloc2104, model_decoder_layers_7_self_attn_v_proj_bias4, alloc2107) + R.vm.kill_object(alloc2104) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_7_self_attn_v_proj_bias4) + gv2781: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1105: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2107, gv2781, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2107) + gv2782: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2108: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2782, R.dtype("float16")) + cls.concatenate1(reshape1103, reshape1104, reshape1105, alloc2108) + R.vm.kill_object(reshape1103) + R.vm.kill_object(reshape1104) + R.vm.kill_object(reshape1105) + gv2783: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1106: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2108, gv2783, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2108) + gv2784: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2109: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2784, R.dtype("float16")) + _2108: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1106, alloc2109) + R.vm.kill_object(reshape1106) + gv2785: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1107: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2109, gv2785, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2109) + gv2786: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1108: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1107, gv2786, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1107) + model_decoder_layers_7_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[662] + model_decoder_layers_7_self_attn_out_proj_bias4: R.Tensor((1280,), 
dtype="float16") = packed_params[663] + gv2787: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2110: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2787, R.dtype("float16")) + _2109: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_self_attn_out_proj_weight4, reshape1108, model_decoder_layers_7_self_attn_out_proj_bias4, alloc2110) + R.vm.kill_object(reshape1108) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_7_self_attn_out_proj_bias4) + gv2788: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2111: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2788, R.dtype("float16")) + cls.add5(alloc2103, alloc2110, alloc2111) + R.vm.kill_object(alloc2103) + R.vm.kill_object(alloc2110) + model_decoder_layers_7_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[673] + model_decoder_layers_7_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[674] + gv2789: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2112: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2789, R.dtype("float16")) + cls.layer_norm2(alloc2111, model_decoder_layers_7_encoder_attn_layer_norm_weight4, model_decoder_layers_7_encoder_attn_layer_norm_bias4, alloc2112) + 
R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_layer_norm_bias4) + model_decoder_layers_7_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[669] + model_decoder_layers_7_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[670] + gv2790: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2113: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2790, R.dtype("float16")) + _2112: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_q_proj_weight4, alloc2112, model_decoder_layers_7_encoder_attn_q_proj_bias4, alloc2113) + R.vm.kill_object(alloc2112) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_q_proj_bias4) + gv2791: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1109: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2113, gv2791, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2113) + gv2792: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1110: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1109, gv2792, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) 
+ R.vm.kill_object(reshape1109) + gv2793: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2114: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2793, R.dtype("float16")) + _2113: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(7), R.prim_value(T.float32(1)), reshape1110, alloc2114) + R.vm.kill_object(reshape1110) + gv2794: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1111: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2114, gv2794, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2114) + gv2795: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1112: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1111, gv2795, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1111) + model_decoder_layers_7_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[671] + model_decoder_layers_7_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[672] + gv2796: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2115: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2796, R.dtype("float16")) + _2114: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_7_encoder_attn_out_proj_weight4, reshape1112, model_decoder_layers_7_encoder_attn_out_proj_bias4, alloc2115) + R.vm.kill_object(reshape1112) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_7_encoder_attn_out_proj_bias4) + gv2797: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2116: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2797, R.dtype("float16")) + cls.add5(alloc2111, alloc2115, alloc2116) + R.vm.kill_object(alloc2111) + R.vm.kill_object(alloc2115) + model_decoder_layers_7_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[679] + model_decoder_layers_7_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[680] + gv2798: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2117: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2798, R.dtype("float16")) + cls.layer_norm2(alloc2116, model_decoder_layers_7_final_layer_norm_weight4, model_decoder_layers_7_final_layer_norm_bias4, alloc2117) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_7_final_layer_norm_bias4) + model_decoder_layers_7_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[675] + model_decoder_layers_7_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[676] + 
gv2799: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2118: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2799, R.dtype("float16")) + _2117: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_7_fc1_weight4, alloc2117, model_decoder_layers_7_fc1_bias4, alloc2118) + R.vm.kill_object(alloc2117) + R.vm.kill_object(model_decoder_layers_7_fc1_weight4) + R.vm.kill_object(model_decoder_layers_7_fc1_bias4) + model_decoder_layers_7_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[677] + model_decoder_layers_7_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[678] + gv2800: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2119: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2800, R.dtype("float16")) + _2118: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_7_fc2_weight4, alloc2118, model_decoder_layers_7_fc2_bias4, alloc2119) + R.vm.kill_object(alloc2118) + R.vm.kill_object(model_decoder_layers_7_fc2_weight4) + R.vm.kill_object(model_decoder_layers_7_fc2_bias4) + gv2801: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2120: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2801, R.dtype("float16")) + cls.add5(alloc2116, alloc2119, alloc2120) + R.vm.kill_object(alloc2116) + 
R.vm.kill_object(alloc2119) + model_decoder_layers_8_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[688] + model_decoder_layers_8_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[689] + gv2802: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2121: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2802, R.dtype("float16")) + cls.layer_norm2(alloc2120, model_decoder_layers_8_self_attn_layer_norm_weight4, model_decoder_layers_8_self_attn_layer_norm_bias4, alloc2121) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_8_self_attn_layer_norm_bias4) + model_decoder_layers_8_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[684] + model_decoder_layers_8_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[685] + gv2803: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2122: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2803, R.dtype("float16")) + _2121: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_q_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_q_proj_bias4, alloc2122) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_8_self_attn_q_proj_bias4) + gv2804: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1113: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2122, gv2804, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2122) + model_decoder_layers_8_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[681] + gv2805: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2123: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2805, R.dtype("float16")) + _2122: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_8_self_attn_k_proj_weight4, alloc2121, alloc2123) + R.vm.kill_object(model_decoder_layers_8_self_attn_k_proj_weight4) + gv2806: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1114: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2123, gv2806, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2123) + model_decoder_layers_8_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[682] + model_decoder_layers_8_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[683] + gv2807: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2124: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, 
R.prim_value(0), gv2807, R.dtype("float16")) + _2123: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_v_proj_weight4, alloc2121, model_decoder_layers_8_self_attn_v_proj_bias4, alloc2124) + R.vm.kill_object(alloc2121) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_8_self_attn_v_proj_bias4) + gv2808: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1115: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2124, gv2808, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2124) + gv2809: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2125: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2809, R.dtype("float16")) + cls.concatenate1(reshape1113, reshape1114, reshape1115, alloc2125) + R.vm.kill_object(reshape1113) + R.vm.kill_object(reshape1114) + R.vm.kill_object(reshape1115) + gv2810: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1116: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2125, gv2810, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2125) + gv2811: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2126: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2811, R.dtype("float16")) + _2125: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1116, alloc2126) + R.vm.kill_object(reshape1116) + gv2812: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1117: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2126, gv2812, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2126) + gv2813: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1118: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1117, gv2813, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1117) + model_decoder_layers_8_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[686] + model_decoder_layers_8_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[687] + gv2814: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2127: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2814, R.dtype("float16")) + _2126: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_self_attn_out_proj_weight4, reshape1118, model_decoder_layers_8_self_attn_out_proj_bias4, alloc2127) + R.vm.kill_object(reshape1118) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_8_self_attn_out_proj_bias4) + gv2815: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2128: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2815, R.dtype("float16")) + cls.add5(alloc2120, alloc2127, alloc2128) + R.vm.kill_object(alloc2120) + R.vm.kill_object(alloc2127) + model_decoder_layers_8_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[697] + model_decoder_layers_8_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[698] + gv2816: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2129: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2816, R.dtype("float16")) + cls.layer_norm2(alloc2128, model_decoder_layers_8_encoder_attn_layer_norm_weight4, model_decoder_layers_8_encoder_attn_layer_norm_bias4, alloc2129) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_layer_norm_bias4) + model_decoder_layers_8_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[693] + model_decoder_layers_8_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[694] + gv2817: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2130: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2817, R.dtype("float16")) + _2129: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_q_proj_weight4, alloc2129, model_decoder_layers_8_encoder_attn_q_proj_bias4, alloc2130) + R.vm.kill_object(alloc2129) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_q_proj_bias4) + gv2818: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1119: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2130, gv2818, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2130) + gv2819: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1120: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1119, gv2819, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1119) + gv2820: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2131: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2820, R.dtype("float16")) + _2130: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(8), R.prim_value(T.float32(1)), reshape1120, alloc2131) + R.vm.kill_object(reshape1120) + gv2821: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1121: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2131, gv2821, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2131) + gv2822: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1122: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1121, gv2822, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1121) + model_decoder_layers_8_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[695] + model_decoder_layers_8_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[696] + gv2823: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2132: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2823, R.dtype("float16")) + _2131: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_8_encoder_attn_out_proj_weight4, reshape1122, model_decoder_layers_8_encoder_attn_out_proj_bias4, alloc2132) + R.vm.kill_object(reshape1122) + 
R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_8_encoder_attn_out_proj_bias4) + gv2824: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2133: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2824, R.dtype("float16")) + cls.add5(alloc2128, alloc2132, alloc2133) + R.vm.kill_object(alloc2128) + R.vm.kill_object(alloc2132) + model_decoder_layers_8_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[703] + model_decoder_layers_8_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[704] + gv2825: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2134: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2825, R.dtype("float16")) + cls.layer_norm2(alloc2133, model_decoder_layers_8_final_layer_norm_weight4, model_decoder_layers_8_final_layer_norm_bias4, alloc2134) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_8_final_layer_norm_bias4) + model_decoder_layers_8_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[699] + model_decoder_layers_8_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[700] + gv2826: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2135: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2826, R.dtype("float16")) + _2134: 
R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_8_fc1_weight4, alloc2134, model_decoder_layers_8_fc1_bias4, alloc2135) + R.vm.kill_object(alloc2134) + R.vm.kill_object(model_decoder_layers_8_fc1_weight4) + R.vm.kill_object(model_decoder_layers_8_fc1_bias4) + model_decoder_layers_8_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[701] + model_decoder_layers_8_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[702] + gv2827: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2136: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2827, R.dtype("float16")) + _2135: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_8_fc2_weight4, alloc2135, model_decoder_layers_8_fc2_bias4, alloc2136) + R.vm.kill_object(alloc2135) + R.vm.kill_object(model_decoder_layers_8_fc2_weight4) + R.vm.kill_object(model_decoder_layers_8_fc2_bias4) + gv2828: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2137: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2828, R.dtype("float16")) + cls.add5(alloc2133, alloc2136, alloc2137) + R.vm.kill_object(alloc2133) + R.vm.kill_object(alloc2136) + model_decoder_layers_9_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[712] + model_decoder_layers_9_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[713] + gv2829: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2138: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2829, R.dtype("float16")) + cls.layer_norm2(alloc2137, model_decoder_layers_9_self_attn_layer_norm_weight4, model_decoder_layers_9_self_attn_layer_norm_bias4, alloc2138) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_9_self_attn_layer_norm_bias4) + model_decoder_layers_9_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[708] + model_decoder_layers_9_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[709] + gv2830: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2139: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2830, R.dtype("float16")) + _2138: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_q_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_q_proj_bias4, alloc2139) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_9_self_attn_q_proj_bias4) + gv2831: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1123: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2139, gv2831, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2139) + model_decoder_layers_9_self_attn_k_proj_weight4: R.Tensor((1280, 1280), 
dtype="float16") = packed_params[705] + gv2832: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2140: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2832, R.dtype("float16")) + _2139: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_9_self_attn_k_proj_weight4, alloc2138, alloc2140) + R.vm.kill_object(model_decoder_layers_9_self_attn_k_proj_weight4) + gv2833: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1124: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2140, gv2833, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2140) + model_decoder_layers_9_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[706] + model_decoder_layers_9_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[707] + gv2834: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2141: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2834, R.dtype("float16")) + _2140: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_v_proj_weight4, alloc2138, model_decoder_layers_9_self_attn_v_proj_bias4, alloc2141) + R.vm.kill_object(alloc2138) + R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_weight4) + 
R.vm.kill_object(model_decoder_layers_9_self_attn_v_proj_bias4) + gv2835: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1125: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2141, gv2835, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2141) + gv2836: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2142: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2836, R.dtype("float16")) + cls.concatenate1(reshape1123, reshape1124, reshape1125, alloc2142) + R.vm.kill_object(reshape1123) + R.vm.kill_object(reshape1124) + R.vm.kill_object(reshape1125) + gv2837: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1126: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2142, gv2837, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2142) + gv2838: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2143: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2838, R.dtype("float16")) + _2142: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, 
R.prim_value(9), R.prim_value(T.float32(1)), reshape1126, alloc2143) + R.vm.kill_object(reshape1126) + gv2839: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1127: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2143, gv2839, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2143) + gv2840: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1128: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1127, gv2840, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1127) + model_decoder_layers_9_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[710] + model_decoder_layers_9_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[711] + gv2841: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2144: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2841, R.dtype("float16")) + _2143: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_self_attn_out_proj_weight4, reshape1128, model_decoder_layers_9_self_attn_out_proj_bias4, alloc2144) + R.vm.kill_object(reshape1128) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_9_self_attn_out_proj_bias4) + gv2842: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2145: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2842, R.dtype("float16")) + cls.add5(alloc2137, alloc2144, alloc2145) + R.vm.kill_object(alloc2137) + R.vm.kill_object(alloc2144) + model_decoder_layers_9_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[721] + model_decoder_layers_9_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[722] + gv2843: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2146: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2843, R.dtype("float16")) + cls.layer_norm2(alloc2145, model_decoder_layers_9_encoder_attn_layer_norm_weight4, model_decoder_layers_9_encoder_attn_layer_norm_bias4, alloc2146) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_layer_norm_bias4) + model_decoder_layers_9_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[717] + model_decoder_layers_9_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[718] + gv2844: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2147: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2844, R.dtype("float16")) + _2146: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", 
model_decoder_layers_9_encoder_attn_q_proj_weight4, alloc2146, model_decoder_layers_9_encoder_attn_q_proj_bias4, alloc2147) + R.vm.kill_object(alloc2146) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_q_proj_bias4) + gv2845: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1129: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2147, gv2845, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2147) + gv2846: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1130: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1129, gv2846, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1129) + gv2847: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2148: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2847, R.dtype("float16")) + _2147: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(9), R.prim_value(T.float32(1)), reshape1130, alloc2148) + R.vm.kill_object(reshape1130) + gv2848: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1131: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2148, gv2848, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2148) + gv2849: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1132: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1131, gv2849, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1131) + model_decoder_layers_9_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[719] + model_decoder_layers_9_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[720] + gv2850: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2149: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2850, R.dtype("float16")) + _2148: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_9_encoder_attn_out_proj_weight4, reshape1132, model_decoder_layers_9_encoder_attn_out_proj_bias4, alloc2149) + R.vm.kill_object(reshape1132) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_9_encoder_attn_out_proj_bias4) + gv2851: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2150: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, 
R.prim_value(0), gv2851, R.dtype("float16")) + cls.add5(alloc2145, alloc2149, alloc2150) + R.vm.kill_object(alloc2145) + R.vm.kill_object(alloc2149) + model_decoder_layers_9_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[727] + model_decoder_layers_9_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[728] + gv2852: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2151: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2852, R.dtype("float16")) + cls.layer_norm2(alloc2150, model_decoder_layers_9_final_layer_norm_weight4, model_decoder_layers_9_final_layer_norm_bias4, alloc2151) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_9_final_layer_norm_bias4) + model_decoder_layers_9_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[723] + model_decoder_layers_9_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[724] + gv2853: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2152: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2853, R.dtype("float16")) + _2151: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_9_fc1_weight4, alloc2151, model_decoder_layers_9_fc1_bias4, alloc2152) + R.vm.kill_object(alloc2151) + R.vm.kill_object(model_decoder_layers_9_fc1_weight4) + R.vm.kill_object(model_decoder_layers_9_fc1_bias4) + model_decoder_layers_9_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[725] + 
model_decoder_layers_9_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[726] + gv2854: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2153: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2854, R.dtype("float16")) + _2152: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_9_fc2_weight4, alloc2152, model_decoder_layers_9_fc2_bias4, alloc2153) + R.vm.kill_object(alloc2152) + R.vm.kill_object(model_decoder_layers_9_fc2_weight4) + R.vm.kill_object(model_decoder_layers_9_fc2_bias4) + gv2855: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2154: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2855, R.dtype("float16")) + cls.add5(alloc2150, alloc2153, alloc2154) + R.vm.kill_object(alloc2150) + R.vm.kill_object(alloc2153) + model_decoder_layers_10_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[736] + model_decoder_layers_10_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[737] + gv2856: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2155: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2856, R.dtype("float16")) + cls.layer_norm2(alloc2154, model_decoder_layers_10_self_attn_layer_norm_weight4, model_decoder_layers_10_self_attn_layer_norm_bias4, alloc2155) + 
R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_10_self_attn_layer_norm_bias4) + model_decoder_layers_10_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[732] + model_decoder_layers_10_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[733] + gv2857: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2156: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2857, R.dtype("float16")) + _2155: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_q_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_q_proj_bias4, alloc2156) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_10_self_attn_q_proj_bias4) + gv2858: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1133: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2156, gv2858, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2156) + model_decoder_layers_10_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[729] + gv2859: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2157: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2859, 
R.dtype("float16")) + _2156: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_10_self_attn_k_proj_weight4, alloc2155, alloc2157) + R.vm.kill_object(model_decoder_layers_10_self_attn_k_proj_weight4) + gv2860: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1134: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2157, gv2860, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2157) + model_decoder_layers_10_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[730] + model_decoder_layers_10_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[731] + gv2861: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2158: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2861, R.dtype("float16")) + _2157: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_v_proj_weight4, alloc2155, model_decoder_layers_10_self_attn_v_proj_bias4, alloc2158) + R.vm.kill_object(alloc2155) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_10_self_attn_v_proj_bias4) + gv2862: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1135: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2158, gv2862, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2158) + gv2863: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2159: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2863, R.dtype("float16")) + cls.concatenate1(reshape1133, reshape1134, reshape1135, alloc2159) + R.vm.kill_object(reshape1133) + R.vm.kill_object(reshape1134) + R.vm.kill_object(reshape1135) + gv2864: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1136: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2159, gv2864, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2159) + gv2865: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2160: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2865, R.dtype("float16")) + _2159: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1136, alloc2160) + R.vm.kill_object(reshape1136) + gv2866: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + 
reshape1137: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2160, gv2866, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2160) + gv2867: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1138: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1137, gv2867, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1137) + model_decoder_layers_10_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[734] + model_decoder_layers_10_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[735] + gv2868: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2161: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2868, R.dtype("float16")) + _2160: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_self_attn_out_proj_weight4, reshape1138, model_decoder_layers_10_self_attn_out_proj_bias4, alloc2161) + R.vm.kill_object(reshape1138) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_10_self_attn_out_proj_bias4) + gv2869: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2162: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2869, R.dtype("float16")) + 
cls.add5(alloc2154, alloc2161, alloc2162) + R.vm.kill_object(alloc2154) + R.vm.kill_object(alloc2161) + model_decoder_layers_10_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[745] + model_decoder_layers_10_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[746] + gv2870: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2163: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2870, R.dtype("float16")) + cls.layer_norm2(alloc2162, model_decoder_layers_10_encoder_attn_layer_norm_weight4, model_decoder_layers_10_encoder_attn_layer_norm_bias4, alloc2163) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_layer_norm_bias4) + model_decoder_layers_10_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[741] + model_decoder_layers_10_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[742] + gv2871: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2164: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2871, R.dtype("float16")) + _2163: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_q_proj_weight4, alloc2163, model_decoder_layers_10_encoder_attn_q_proj_bias4, alloc2164) + R.vm.kill_object(alloc2163) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_q_proj_bias4) + gv2872: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1139: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2164, gv2872, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2164) + gv2873: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1140: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1139, gv2873, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1139) + gv2874: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2165: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2874, R.dtype("float16")) + _2164: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(10), R.prim_value(T.float32(1)), reshape1140, alloc2165) + R.vm.kill_object(reshape1140) + gv2875: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1141: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2165, gv2875, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2165) + gv2876: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1142: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1141, gv2876, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1141) + model_decoder_layers_10_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[743] + model_decoder_layers_10_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[744] + gv2877: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2166: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2877, R.dtype("float16")) + _2165: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_10_encoder_attn_out_proj_weight4, reshape1142, model_decoder_layers_10_encoder_attn_out_proj_bias4, alloc2166) + R.vm.kill_object(reshape1142) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_10_encoder_attn_out_proj_bias4) + gv2878: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2167: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2878, R.dtype("float16")) + cls.add5(alloc2162, alloc2166, alloc2167) + R.vm.kill_object(alloc2162) + R.vm.kill_object(alloc2166) + model_decoder_layers_10_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[751] + model_decoder_layers_10_final_layer_norm_bias4: R.Tensor((1280,), 
dtype="float16") = packed_params[752] + gv2879: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2168: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2879, R.dtype("float16")) + cls.layer_norm2(alloc2167, model_decoder_layers_10_final_layer_norm_weight4, model_decoder_layers_10_final_layer_norm_bias4, alloc2168) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_10_final_layer_norm_bias4) + model_decoder_layers_10_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[747] + model_decoder_layers_10_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[748] + gv2880: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2169: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2880, R.dtype("float16")) + _2168: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_10_fc1_weight4, alloc2168, model_decoder_layers_10_fc1_bias4, alloc2169) + R.vm.kill_object(alloc2168) + R.vm.kill_object(model_decoder_layers_10_fc1_weight4) + R.vm.kill_object(model_decoder_layers_10_fc1_bias4) + model_decoder_layers_10_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[749] + model_decoder_layers_10_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[750] + gv2881: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2170: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2881, R.dtype("float16")) + _2169: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_10_fc2_weight4, alloc2169, model_decoder_layers_10_fc2_bias4, alloc2170) + R.vm.kill_object(alloc2169) + R.vm.kill_object(model_decoder_layers_10_fc2_weight4) + R.vm.kill_object(model_decoder_layers_10_fc2_bias4) + gv2882: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2171: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2882, R.dtype("float16")) + cls.add5(alloc2167, alloc2170, alloc2171) + R.vm.kill_object(alloc2167) + R.vm.kill_object(alloc2170) + model_decoder_layers_11_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[760] + model_decoder_layers_11_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[761] + gv2883: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2172: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2883, R.dtype("float16")) + cls.layer_norm2(alloc2171, model_decoder_layers_11_self_attn_layer_norm_weight4, model_decoder_layers_11_self_attn_layer_norm_bias4, alloc2172) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_11_self_attn_layer_norm_bias4) + model_decoder_layers_11_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[756] + model_decoder_layers_11_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[757] + gv2884: 
R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2173: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2884, R.dtype("float16")) + _2172: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_q_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_q_proj_bias4, alloc2173) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_11_self_attn_q_proj_bias4) + gv2885: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1143: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2173, gv2885, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2173) + model_decoder_layers_11_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[753] + gv2886: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2174: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2886, R.dtype("float16")) + _2173: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_11_self_attn_k_proj_weight4, alloc2172, alloc2174) + R.vm.kill_object(model_decoder_layers_11_self_attn_k_proj_weight4) + gv2887: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1144: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2174, gv2887, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2174) + model_decoder_layers_11_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[754] + model_decoder_layers_11_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[755] + gv2888: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2175: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2888, R.dtype("float16")) + _2174: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_v_proj_weight4, alloc2172, model_decoder_layers_11_self_attn_v_proj_bias4, alloc2175) + R.vm.kill_object(alloc2172) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_11_self_attn_v_proj_bias4) + gv2889: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1145: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2175, gv2889, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2175) + gv2890: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), 
R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2176: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2890, R.dtype("float16")) + cls.concatenate1(reshape1143, reshape1144, reshape1145, alloc2176) + R.vm.kill_object(reshape1143) + R.vm.kill_object(reshape1144) + R.vm.kill_object(reshape1145) + gv2891: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1146: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2176, gv2891, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2176) + gv2892: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2177: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2892, R.dtype("float16")) + _2176: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1146, alloc2177) + R.vm.kill_object(reshape1146) + gv2893: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1147: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2177, gv2893, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2177) + gv2894: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1148: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1147, gv2894, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1147) + model_decoder_layers_11_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[758] + model_decoder_layers_11_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[759] + gv2895: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2178: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2895, R.dtype("float16")) + _2177: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_self_attn_out_proj_weight4, reshape1148, model_decoder_layers_11_self_attn_out_proj_bias4, alloc2178) + R.vm.kill_object(reshape1148) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_11_self_attn_out_proj_bias4) + gv2896: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2179: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2896, R.dtype("float16")) + cls.add5(alloc2171, alloc2178, alloc2179) + R.vm.kill_object(alloc2171) + R.vm.kill_object(alloc2178) + model_decoder_layers_11_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[769] + model_decoder_layers_11_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[770] + gv2897: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, 
R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2180: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2897, R.dtype("float16")) + cls.layer_norm2(alloc2179, model_decoder_layers_11_encoder_attn_layer_norm_weight4, model_decoder_layers_11_encoder_attn_layer_norm_bias4, alloc2180) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_layer_norm_bias4) + model_decoder_layers_11_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[765] + model_decoder_layers_11_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[766] + gv2898: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2181: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2898, R.dtype("float16")) + _2180: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_q_proj_weight4, alloc2180, model_decoder_layers_11_encoder_attn_q_proj_bias4, alloc2181) + R.vm.kill_object(alloc2180) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_q_proj_bias4) + gv2899: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1149: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2181, gv2899, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2181) + gv2900: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1150: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1149, gv2900, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1149) + gv2901: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2182: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2901, R.dtype("float16")) + _2181: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(11), R.prim_value(T.float32(1)), reshape1150, alloc2182) + R.vm.kill_object(reshape1150) + gv2902: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1151: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2182, gv2902, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2182) + gv2903: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1152: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1151, gv2903, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1151) + 
model_decoder_layers_11_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[767] + model_decoder_layers_11_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[768] + gv2904: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2183: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2904, R.dtype("float16")) + _2182: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_11_encoder_attn_out_proj_weight4, reshape1152, model_decoder_layers_11_encoder_attn_out_proj_bias4, alloc2183) + R.vm.kill_object(reshape1152) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_11_encoder_attn_out_proj_bias4) + gv2905: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2184: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2905, R.dtype("float16")) + cls.add5(alloc2179, alloc2183, alloc2184) + R.vm.kill_object(alloc2179) + R.vm.kill_object(alloc2183) + model_decoder_layers_11_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[775] + model_decoder_layers_11_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[776] + gv2906: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2185: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2906, 
R.dtype("float16")) + cls.layer_norm2(alloc2184, model_decoder_layers_11_final_layer_norm_weight4, model_decoder_layers_11_final_layer_norm_bias4, alloc2185) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_11_final_layer_norm_bias4) + model_decoder_layers_11_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[771] + model_decoder_layers_11_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[772] + gv2907: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2186: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2907, R.dtype("float16")) + _2185: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_11_fc1_weight4, alloc2185, model_decoder_layers_11_fc1_bias4, alloc2186) + R.vm.kill_object(alloc2185) + R.vm.kill_object(model_decoder_layers_11_fc1_weight4) + R.vm.kill_object(model_decoder_layers_11_fc1_bias4) + model_decoder_layers_11_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[773] + model_decoder_layers_11_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[774] + gv2908: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2187: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2908, R.dtype("float16")) + _2186: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_11_fc2_weight4, alloc2186, model_decoder_layers_11_fc2_bias4, alloc2187) + R.vm.kill_object(alloc2186) + 
R.vm.kill_object(model_decoder_layers_11_fc2_weight4) + R.vm.kill_object(model_decoder_layers_11_fc2_bias4) + gv2909: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2188: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2909, R.dtype("float16")) + cls.add5(alloc2184, alloc2187, alloc2188) + R.vm.kill_object(alloc2184) + R.vm.kill_object(alloc2187) + model_decoder_layers_12_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[784] + model_decoder_layers_12_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[785] + gv2910: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2189: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2910, R.dtype("float16")) + cls.layer_norm2(alloc2188, model_decoder_layers_12_self_attn_layer_norm_weight4, model_decoder_layers_12_self_attn_layer_norm_bias4, alloc2189) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_12_self_attn_layer_norm_bias4) + model_decoder_layers_12_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[780] + model_decoder_layers_12_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[781] + gv2911: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2190: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2911, 
R.dtype("float16")) + _2189: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_q_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_q_proj_bias4, alloc2190) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_12_self_attn_q_proj_bias4) + gv2912: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1153: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2190, gv2912, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2190) + model_decoder_layers_12_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[777] + gv2913: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2191: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2913, R.dtype("float16")) + _2190: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_12_self_attn_k_proj_weight4, alloc2189, alloc2191) + R.vm.kill_object(model_decoder_layers_12_self_attn_k_proj_weight4) + gv2914: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1154: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2191, gv2914, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2191) + model_decoder_layers_12_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[778] + model_decoder_layers_12_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[779] + gv2915: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2192: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2915, R.dtype("float16")) + _2191: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_v_proj_weight4, alloc2189, model_decoder_layers_12_self_attn_v_proj_bias4, alloc2192) + R.vm.kill_object(alloc2189) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_12_self_attn_v_proj_bias4) + gv2916: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1155: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2192, gv2916, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2192) + gv2917: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2193: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2917, R.dtype("float16")) + cls.concatenate1(reshape1153, reshape1154, reshape1155, alloc2193) + R.vm.kill_object(reshape1153) + R.vm.kill_object(reshape1154) + 
R.vm.kill_object(reshape1155) + gv2918: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1156: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2193, gv2918, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2193) + gv2919: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2194: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2919, R.dtype("float16")) + _2193: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1156, alloc2194) + R.vm.kill_object(reshape1156) + gv2920: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1157: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2194, gv2920, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2194) + gv2921: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1158: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1157, gv2921, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1157) + 
model_decoder_layers_12_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[782] + model_decoder_layers_12_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[783] + gv2922: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2195: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2922, R.dtype("float16")) + _2194: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_self_attn_out_proj_weight4, reshape1158, model_decoder_layers_12_self_attn_out_proj_bias4, alloc2195) + R.vm.kill_object(reshape1158) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_12_self_attn_out_proj_bias4) + gv2923: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2196: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2923, R.dtype("float16")) + cls.add5(alloc2188, alloc2195, alloc2196) + R.vm.kill_object(alloc2188) + R.vm.kill_object(alloc2195) + model_decoder_layers_12_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[793] + model_decoder_layers_12_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[794] + gv2924: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2197: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2924, 
R.dtype("float16")) + cls.layer_norm2(alloc2196, model_decoder_layers_12_encoder_attn_layer_norm_weight4, model_decoder_layers_12_encoder_attn_layer_norm_bias4, alloc2197) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_layer_norm_bias4) + model_decoder_layers_12_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[789] + model_decoder_layers_12_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[790] + gv2925: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2198: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2925, R.dtype("float16")) + _2197: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_q_proj_weight4, alloc2197, model_decoder_layers_12_encoder_attn_q_proj_bias4, alloc2198) + R.vm.kill_object(alloc2197) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_q_proj_bias4) + gv2926: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1159: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2198, gv2926, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2198) + gv2927: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape1160: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1159, gv2927, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1159) + gv2928: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2199: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2928, R.dtype("float16")) + _2198: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(12), R.prim_value(T.float32(1)), reshape1160, alloc2199) + R.vm.kill_object(reshape1160) + gv2929: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1161: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2199, gv2929, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2199) + gv2930: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1162: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1161, gv2930, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1161) + model_decoder_layers_12_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[791] + model_decoder_layers_12_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[792] + gv2931: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2200: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2931, R.dtype("float16")) + _2199: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_12_encoder_attn_out_proj_weight4, reshape1162, model_decoder_layers_12_encoder_attn_out_proj_bias4, alloc2200) + R.vm.kill_object(reshape1162) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_12_encoder_attn_out_proj_bias4) + gv2932: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2201: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2932, R.dtype("float16")) + cls.add5(alloc2196, alloc2200, alloc2201) + R.vm.kill_object(alloc2196) + R.vm.kill_object(alloc2200) + model_decoder_layers_12_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[799] + model_decoder_layers_12_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[800] + gv2933: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2202: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2933, R.dtype("float16")) + cls.layer_norm2(alloc2201, model_decoder_layers_12_final_layer_norm_weight4, model_decoder_layers_12_final_layer_norm_bias4, alloc2202) + R.vm.kill_object(model_decoder_layers_12_final_layer_norm_weight4) + 
R.vm.kill_object(model_decoder_layers_12_final_layer_norm_bias4) + model_decoder_layers_12_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[795] + model_decoder_layers_12_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[796] + gv2934: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2203: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2934, R.dtype("float16")) + _2202: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_12_fc1_weight4, alloc2202, model_decoder_layers_12_fc1_bias4, alloc2203) + R.vm.kill_object(alloc2202) + R.vm.kill_object(model_decoder_layers_12_fc1_weight4) + R.vm.kill_object(model_decoder_layers_12_fc1_bias4) + model_decoder_layers_12_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[797] + model_decoder_layers_12_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[798] + gv2935: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2204: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2935, R.dtype("float16")) + _2203: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_12_fc2_weight4, alloc2203, model_decoder_layers_12_fc2_bias4, alloc2204) + R.vm.kill_object(alloc2203) + R.vm.kill_object(model_decoder_layers_12_fc2_weight4) + R.vm.kill_object(model_decoder_layers_12_fc2_bias4) + gv2936: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2205: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2936, R.dtype("float16")) + cls.add5(alloc2201, alloc2204, alloc2205) + R.vm.kill_object(alloc2201) + R.vm.kill_object(alloc2204) + model_decoder_layers_13_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[808] + model_decoder_layers_13_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[809] + gv2937: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2206: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2937, R.dtype("float16")) + cls.layer_norm2(alloc2205, model_decoder_layers_13_self_attn_layer_norm_weight4, model_decoder_layers_13_self_attn_layer_norm_bias4, alloc2206) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_13_self_attn_layer_norm_bias4) + model_decoder_layers_13_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[804] + model_decoder_layers_13_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[805] + gv2938: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2207: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2938, R.dtype("float16")) + _2206: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_q_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_q_proj_bias4, alloc2207) + 
R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_13_self_attn_q_proj_bias4) + gv2939: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1163: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2207, gv2939, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2207) + model_decoder_layers_13_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[801] + gv2940: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2208: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2940, R.dtype("float16")) + _2207: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_13_self_attn_k_proj_weight4, alloc2206, alloc2208) + R.vm.kill_object(model_decoder_layers_13_self_attn_k_proj_weight4) + gv2941: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1164: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2208, gv2941, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2208) + model_decoder_layers_13_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[802] + model_decoder_layers_13_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[803] + gv2942: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2209: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2942, R.dtype("float16")) + _2208: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_v_proj_weight4, alloc2206, model_decoder_layers_13_self_attn_v_proj_bias4, alloc2209) + R.vm.kill_object(alloc2206) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_13_self_attn_v_proj_bias4) + gv2943: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1165: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2209, gv2943, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2209) + gv2944: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2210: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2944, R.dtype("float16")) + cls.concatenate1(reshape1163, reshape1164, reshape1165, alloc2210) + R.vm.kill_object(reshape1163) + R.vm.kill_object(reshape1164) + R.vm.kill_object(reshape1165) + gv2945: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=3),)) + reshape1166: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2210, gv2945, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2210) + gv2946: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2211: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2946, R.dtype("float16")) + _2210: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1166, alloc2211) + R.vm.kill_object(reshape1166) + gv2947: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1167: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2211, gv2947, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2211) + gv2948: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1168: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1167, gv2948, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1167) + model_decoder_layers_13_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[806] + model_decoder_layers_13_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[807] + gv2949: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2212: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2949, R.dtype("float16")) + _2211: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_self_attn_out_proj_weight4, reshape1168, model_decoder_layers_13_self_attn_out_proj_bias4, alloc2212) + R.vm.kill_object(reshape1168) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_13_self_attn_out_proj_bias4) + gv2950: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2213: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2950, R.dtype("float16")) + cls.add5(alloc2205, alloc2212, alloc2213) + R.vm.kill_object(alloc2205) + R.vm.kill_object(alloc2212) + model_decoder_layers_13_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[817] + model_decoder_layers_13_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[818] + gv2951: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2214: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2951, R.dtype("float16")) + cls.layer_norm2(alloc2213, model_decoder_layers_13_encoder_attn_layer_norm_weight4, model_decoder_layers_13_encoder_attn_layer_norm_bias4, alloc2214) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_weight4) + 
R.vm.kill_object(model_decoder_layers_13_encoder_attn_layer_norm_bias4) + model_decoder_layers_13_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[813] + model_decoder_layers_13_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[814] + gv2952: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2215: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2952, R.dtype("float16")) + _2214: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_q_proj_weight4, alloc2214, model_decoder_layers_13_encoder_attn_q_proj_bias4, alloc2215) + R.vm.kill_object(alloc2214) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_q_proj_bias4) + gv2953: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1169: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2215, gv2953, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2215) + gv2954: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1170: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1169, gv2954, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1169) + gv2955: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2216: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2955, R.dtype("float16")) + _2215: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(13), R.prim_value(T.float32(1)), reshape1170, alloc2216) + R.vm.kill_object(reshape1170) + gv2956: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1171: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2216, gv2956, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2216) + gv2957: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1172: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1171, gv2957, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1171) + model_decoder_layers_13_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[815] + model_decoder_layers_13_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[816] + gv2958: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2217: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv2958, R.dtype("float16")) + _2216: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_13_encoder_attn_out_proj_weight4, reshape1172, model_decoder_layers_13_encoder_attn_out_proj_bias4, alloc2217) + R.vm.kill_object(reshape1172) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_13_encoder_attn_out_proj_bias4) + gv2959: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2218: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2959, R.dtype("float16")) + cls.add5(alloc2213, alloc2217, alloc2218) + R.vm.kill_object(alloc2213) + R.vm.kill_object(alloc2217) + model_decoder_layers_13_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[823] + model_decoder_layers_13_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[824] + gv2960: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2219: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2960, R.dtype("float16")) + cls.layer_norm2(alloc2218, model_decoder_layers_13_final_layer_norm_weight4, model_decoder_layers_13_final_layer_norm_bias4, alloc2219) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_13_final_layer_norm_bias4) + model_decoder_layers_13_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[819] + model_decoder_layers_13_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[820] + gv2961: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2220: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2961, R.dtype("float16")) + _2219: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_13_fc1_weight4, alloc2219, model_decoder_layers_13_fc1_bias4, alloc2220) + R.vm.kill_object(alloc2219) + R.vm.kill_object(model_decoder_layers_13_fc1_weight4) + R.vm.kill_object(model_decoder_layers_13_fc1_bias4) + model_decoder_layers_13_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[821] + model_decoder_layers_13_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[822] + gv2962: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2221: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2962, R.dtype("float16")) + _2220: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_13_fc2_weight4, alloc2220, model_decoder_layers_13_fc2_bias4, alloc2221) + R.vm.kill_object(alloc2220) + R.vm.kill_object(model_decoder_layers_13_fc2_weight4) + R.vm.kill_object(model_decoder_layers_13_fc2_bias4) + gv2963: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2222: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2963, R.dtype("float16")) + cls.add5(alloc2218, alloc2221, alloc2222) + R.vm.kill_object(alloc2218) + 
R.vm.kill_object(alloc2221) + model_decoder_layers_14_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[832] + model_decoder_layers_14_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[833] + gv2964: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2223: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2964, R.dtype("float16")) + cls.layer_norm2(alloc2222, model_decoder_layers_14_self_attn_layer_norm_weight4, model_decoder_layers_14_self_attn_layer_norm_bias4, alloc2223) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_14_self_attn_layer_norm_bias4) + model_decoder_layers_14_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[828] + model_decoder_layers_14_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[829] + gv2965: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2224: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2965, R.dtype("float16")) + _2223: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_q_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_q_proj_bias4, alloc2224) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_14_self_attn_q_proj_bias4) + gv2966: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1173: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2224, gv2966, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2224) + model_decoder_layers_14_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[825] + gv2967: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2225: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2967, R.dtype("float16")) + _2224: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_14_self_attn_k_proj_weight4, alloc2223, alloc2225) + R.vm.kill_object(model_decoder_layers_14_self_attn_k_proj_weight4) + gv2968: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1174: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2225, gv2968, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2225) + model_decoder_layers_14_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[826] + model_decoder_layers_14_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[827] + gv2969: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2226: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, 
R.prim_value(0), gv2969, R.dtype("float16")) + _2225: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_v_proj_weight4, alloc2223, model_decoder_layers_14_self_attn_v_proj_bias4, alloc2226) + R.vm.kill_object(alloc2223) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_14_self_attn_v_proj_bias4) + gv2970: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1175: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2226, gv2970, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2226) + gv2971: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2227: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2971, R.dtype("float16")) + cls.concatenate1(reshape1173, reshape1174, reshape1175, alloc2227) + R.vm.kill_object(reshape1173) + R.vm.kill_object(reshape1174) + R.vm.kill_object(reshape1175) + gv2972: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1176: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2227, gv2972, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2227) + gv2973: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2228: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2973, R.dtype("float16")) + _2227: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1176, alloc2228) + R.vm.kill_object(reshape1176) + gv2974: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1177: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2228, gv2974, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2228) + gv2975: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1178: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1177, gv2975, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1177) + model_decoder_layers_14_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[830] + model_decoder_layers_14_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[831] + gv2976: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2229: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2976, R.dtype("float16")) + _2228: R.Object = 
R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_self_attn_out_proj_weight4, reshape1178, model_decoder_layers_14_self_attn_out_proj_bias4, alloc2229) + R.vm.kill_object(reshape1178) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_14_self_attn_out_proj_bias4) + gv2977: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2230: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2977, R.dtype("float16")) + cls.add5(alloc2222, alloc2229, alloc2230) + R.vm.kill_object(alloc2222) + R.vm.kill_object(alloc2229) + model_decoder_layers_14_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[841] + model_decoder_layers_14_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[842] + gv2978: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2231: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2978, R.dtype("float16")) + cls.layer_norm2(alloc2230, model_decoder_layers_14_encoder_attn_layer_norm_weight4, model_decoder_layers_14_encoder_attn_layer_norm_bias4, alloc2231) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_layer_norm_bias4) + model_decoder_layers_14_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[837] + model_decoder_layers_14_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[838] + gv2979: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2232: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2979, R.dtype("float16")) + _2231: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_q_proj_weight4, alloc2231, model_decoder_layers_14_encoder_attn_q_proj_bias4, alloc2232) + R.vm.kill_object(alloc2231) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_q_proj_bias4) + gv2980: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1179: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2232, gv2980, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2232) + gv2981: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1180: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1179, gv2981, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1179) + gv2982: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2233: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2982, R.dtype("float16")) + 
_2232: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(14), R.prim_value(T.float32(1)), reshape1180, alloc2233) + R.vm.kill_object(reshape1180) + gv2983: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1181: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2233, gv2983, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2233) + gv2984: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1182: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1181, gv2984, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1181) + model_decoder_layers_14_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[839] + model_decoder_layers_14_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[840] + gv2985: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2234: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2985, R.dtype("float16")) + _2233: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_14_encoder_attn_out_proj_weight4, reshape1182, model_decoder_layers_14_encoder_attn_out_proj_bias4, alloc2234) + R.vm.kill_object(reshape1182) + 
R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_14_encoder_attn_out_proj_bias4) + gv2986: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2235: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2986, R.dtype("float16")) + cls.add5(alloc2230, alloc2234, alloc2235) + R.vm.kill_object(alloc2230) + R.vm.kill_object(alloc2234) + model_decoder_layers_14_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[847] + model_decoder_layers_14_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[848] + gv2987: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2236: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2987, R.dtype("float16")) + cls.layer_norm2(alloc2235, model_decoder_layers_14_final_layer_norm_weight4, model_decoder_layers_14_final_layer_norm_bias4, alloc2236) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_14_final_layer_norm_bias4) + model_decoder_layers_14_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[843] + model_decoder_layers_14_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[844] + gv2988: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2237: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2988, R.dtype("float16")) + 
_2236: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_14_fc1_weight4, alloc2236, model_decoder_layers_14_fc1_bias4, alloc2237) + R.vm.kill_object(alloc2236) + R.vm.kill_object(model_decoder_layers_14_fc1_weight4) + R.vm.kill_object(model_decoder_layers_14_fc1_bias4) + model_decoder_layers_14_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[845] + model_decoder_layers_14_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[846] + gv2989: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2238: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2989, R.dtype("float16")) + _2237: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_14_fc2_weight4, alloc2237, model_decoder_layers_14_fc2_bias4, alloc2238) + R.vm.kill_object(alloc2237) + R.vm.kill_object(model_decoder_layers_14_fc2_weight4) + R.vm.kill_object(model_decoder_layers_14_fc2_bias4) + gv2990: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2239: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv2990, R.dtype("float16")) + cls.add5(alloc2235, alloc2238, alloc2239) + R.vm.kill_object(alloc2235) + R.vm.kill_object(alloc2238) + model_decoder_layers_15_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[856] + model_decoder_layers_15_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[857] + gv2991: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2240: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2991, R.dtype("float16")) + cls.layer_norm2(alloc2239, model_decoder_layers_15_self_attn_layer_norm_weight4, model_decoder_layers_15_self_attn_layer_norm_bias4, alloc2240) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_15_self_attn_layer_norm_bias4) + model_decoder_layers_15_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[852] + model_decoder_layers_15_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[853] + gv2992: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2241: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv2992, R.dtype("float16")) + _2240: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_q_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_q_proj_bias4, alloc2241) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_15_self_attn_q_proj_bias4) + gv2993: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1183: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2241, gv2993, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2241) + 
model_decoder_layers_15_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[849] + gv2994: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2242: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv2994, R.dtype("float16")) + _2241: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_15_self_attn_k_proj_weight4, alloc2240, alloc2242) + R.vm.kill_object(model_decoder_layers_15_self_attn_k_proj_weight4) + gv2995: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1184: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2242, gv2995, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2242) + model_decoder_layers_15_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[850] + model_decoder_layers_15_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[851] + gv2996: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2243: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv2996, R.dtype("float16")) + _2242: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_v_proj_weight4, alloc2240, model_decoder_layers_15_self_attn_v_proj_bias4, alloc2243) + R.vm.kill_object(alloc2240) + 
R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_15_self_attn_v_proj_bias4) + gv2997: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1185: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2243, gv2997, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2243) + gv2998: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2244: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv2998, R.dtype("float16")) + cls.concatenate1(reshape1183, reshape1184, reshape1185, alloc2244) + R.vm.kill_object(reshape1183) + R.vm.kill_object(reshape1184) + R.vm.kill_object(reshape1185) + gv2999: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1186: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2244, gv2999, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2244) + gv3000: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2245: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3000, R.dtype("float16")) + _2244: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape1186, alloc2245) + R.vm.kill_object(reshape1186) + gv3001: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1187: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2245, gv3001, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2245) + gv3002: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1188: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1187, gv3002, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1187) + model_decoder_layers_15_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[854] + model_decoder_layers_15_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[855] + gv3003: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2246: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3003, R.dtype("float16")) + _2245: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_self_attn_out_proj_weight4, reshape1188, model_decoder_layers_15_self_attn_out_proj_bias4, alloc2246) + R.vm.kill_object(reshape1188) + 
R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_15_self_attn_out_proj_bias4) + gv3004: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2247: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3004, R.dtype("float16")) + cls.add5(alloc2239, alloc2246, alloc2247) + R.vm.kill_object(alloc2239) + R.vm.kill_object(alloc2246) + model_decoder_layers_15_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[865] + model_decoder_layers_15_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[866] + gv3005: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2248: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3005, R.dtype("float16")) + cls.layer_norm2(alloc2247, model_decoder_layers_15_encoder_attn_layer_norm_weight4, model_decoder_layers_15_encoder_attn_layer_norm_bias4, alloc2248) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_layer_norm_bias4) + model_decoder_layers_15_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[861] + model_decoder_layers_15_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[862] + gv3006: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2249: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv3006, R.dtype("float16")) + _2248: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_q_proj_weight4, alloc2248, model_decoder_layers_15_encoder_attn_q_proj_bias4, alloc2249) + R.vm.kill_object(alloc2248) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_q_proj_bias4) + gv3007: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1189: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2249, gv3007, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2249) + gv3008: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1190: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1189, gv3008, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1189) + gv3009: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2250: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3009, R.dtype("float16")) + _2249: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(15), R.prim_value(T.float32(1)), reshape1190, alloc2250) + R.vm.kill_object(reshape1190) + gv3010: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1191: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2250, gv3010, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2250) + gv3011: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1192: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1191, gv3011, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1191) + model_decoder_layers_15_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[863] + model_decoder_layers_15_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[864] + gv3012: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2251: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3012, R.dtype("float16")) + _2250: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_15_encoder_attn_out_proj_weight4, reshape1192, model_decoder_layers_15_encoder_attn_out_proj_bias4, alloc2251) + R.vm.kill_object(reshape1192) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_15_encoder_attn_out_proj_bias4) + gv3013: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2252: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3013, R.dtype("float16")) + cls.add5(alloc2247, alloc2251, alloc2252) + R.vm.kill_object(alloc2247) + R.vm.kill_object(alloc2251) + model_decoder_layers_15_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[871] + model_decoder_layers_15_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[872] + gv3014: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2253: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3014, R.dtype("float16")) + cls.layer_norm2(alloc2252, model_decoder_layers_15_final_layer_norm_weight4, model_decoder_layers_15_final_layer_norm_bias4, alloc2253) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_15_final_layer_norm_bias4) + model_decoder_layers_15_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[867] + model_decoder_layers_15_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[868] + gv3015: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2254: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3015, R.dtype("float16")) + _2253: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_15_fc1_weight4, alloc2253, model_decoder_layers_15_fc1_bias4, alloc2254) + R.vm.kill_object(alloc2253) + 
R.vm.kill_object(model_decoder_layers_15_fc1_weight4) + R.vm.kill_object(model_decoder_layers_15_fc1_bias4) + model_decoder_layers_15_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[869] + model_decoder_layers_15_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[870] + gv3016: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2255: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3016, R.dtype("float16")) + _2254: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_15_fc2_weight4, alloc2254, model_decoder_layers_15_fc2_bias4, alloc2255) + R.vm.kill_object(alloc2254) + R.vm.kill_object(model_decoder_layers_15_fc2_weight4) + R.vm.kill_object(model_decoder_layers_15_fc2_bias4) + gv3017: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2256: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3017, R.dtype("float16")) + cls.add5(alloc2252, alloc2255, alloc2256) + R.vm.kill_object(alloc2252) + R.vm.kill_object(alloc2255) + model_decoder_layers_16_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[880] + model_decoder_layers_16_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[881] + gv3018: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2257: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3018, 
R.dtype("float16")) + cls.layer_norm2(alloc2256, model_decoder_layers_16_self_attn_layer_norm_weight4, model_decoder_layers_16_self_attn_layer_norm_bias4, alloc2257) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_16_self_attn_layer_norm_bias4) + model_decoder_layers_16_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[876] + model_decoder_layers_16_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[877] + gv3019: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2258: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3019, R.dtype("float16")) + _2257: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_q_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_q_proj_bias4, alloc2258) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_16_self_attn_q_proj_bias4) + gv3020: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1193: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2258, gv3020, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2258) + model_decoder_layers_16_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[873] + gv3021: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2259: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3021, R.dtype("float16")) + _2258: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_16_self_attn_k_proj_weight4, alloc2257, alloc2259) + R.vm.kill_object(model_decoder_layers_16_self_attn_k_proj_weight4) + gv3022: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1194: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2259, gv3022, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2259) + model_decoder_layers_16_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[874] + model_decoder_layers_16_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[875] + gv3023: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2260: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3023, R.dtype("float16")) + _2259: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_v_proj_weight4, alloc2257, model_decoder_layers_16_self_attn_v_proj_bias4, alloc2260) + R.vm.kill_object(alloc2257) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_16_self_attn_v_proj_bias4) + gv3024: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1195: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2260, gv3024, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2260) + gv3025: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2261: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3025, R.dtype("float16")) + cls.concatenate1(reshape1193, reshape1194, reshape1195, alloc2261) + R.vm.kill_object(reshape1193) + R.vm.kill_object(reshape1194) + R.vm.kill_object(reshape1195) + gv3026: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1196: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2261, gv3026, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2261) + gv3027: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2262: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3027, R.dtype("float16")) + _2261: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1196, alloc2262) + R.vm.kill_object(reshape1196) + gv3028: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1197: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2262, gv3028, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2262) + gv3029: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1198: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1197, gv3029, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1197) + model_decoder_layers_16_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[878] + model_decoder_layers_16_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[879] + gv3030: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2263: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3030, R.dtype("float16")) + _2262: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_self_attn_out_proj_weight4, reshape1198, model_decoder_layers_16_self_attn_out_proj_bias4, alloc2263) + R.vm.kill_object(reshape1198) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_16_self_attn_out_proj_bias4) + gv3031: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2264: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3031, R.dtype("float16")) + cls.add5(alloc2256, alloc2263, alloc2264) + R.vm.kill_object(alloc2256) + R.vm.kill_object(alloc2263) + model_decoder_layers_16_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[889] + model_decoder_layers_16_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[890] + gv3032: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2265: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3032, R.dtype("float16")) + cls.layer_norm2(alloc2264, model_decoder_layers_16_encoder_attn_layer_norm_weight4, model_decoder_layers_16_encoder_attn_layer_norm_bias4, alloc2265) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_layer_norm_bias4) + model_decoder_layers_16_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[885] + model_decoder_layers_16_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[886] + gv3033: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2266: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3033, R.dtype("float16")) + _2265: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_q_proj_weight4, alloc2265, model_decoder_layers_16_encoder_attn_q_proj_bias4, alloc2266) + R.vm.kill_object(alloc2265) + 
R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_q_proj_bias4) + gv3034: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1199: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2266, gv3034, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2266) + gv3035: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1200: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1199, gv3035, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1199) + gv3036: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2267: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3036, R.dtype("float16")) + _2266: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(16), R.prim_value(T.float32(1)), reshape1200, alloc2267) + R.vm.kill_object(reshape1200) + gv3037: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1201: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2267, gv3037, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2267) + gv3038: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1202: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1201, gv3038, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1201) + model_decoder_layers_16_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[887] + model_decoder_layers_16_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[888] + gv3039: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2268: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3039, R.dtype("float16")) + _2267: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_16_encoder_attn_out_proj_weight4, reshape1202, model_decoder_layers_16_encoder_attn_out_proj_bias4, alloc2268) + R.vm.kill_object(reshape1202) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_16_encoder_attn_out_proj_bias4) + gv3040: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2269: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3040, R.dtype("float16")) + cls.add5(alloc2264, alloc2268, alloc2269) + R.vm.kill_object(alloc2264) + R.vm.kill_object(alloc2268) + 
model_decoder_layers_16_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[895] + model_decoder_layers_16_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[896] + gv3041: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2270: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3041, R.dtype("float16")) + cls.layer_norm2(alloc2269, model_decoder_layers_16_final_layer_norm_weight4, model_decoder_layers_16_final_layer_norm_bias4, alloc2270) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_16_final_layer_norm_bias4) + model_decoder_layers_16_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[891] + model_decoder_layers_16_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[892] + gv3042: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2271: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3042, R.dtype("float16")) + _2270: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_16_fc1_weight4, alloc2270, model_decoder_layers_16_fc1_bias4, alloc2271) + R.vm.kill_object(alloc2270) + R.vm.kill_object(model_decoder_layers_16_fc1_weight4) + R.vm.kill_object(model_decoder_layers_16_fc1_bias4) + model_decoder_layers_16_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[893] + model_decoder_layers_16_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[894] + gv3043: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2272: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3043, R.dtype("float16")) + _2271: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_16_fc2_weight4, alloc2271, model_decoder_layers_16_fc2_bias4, alloc2272) + R.vm.kill_object(alloc2271) + R.vm.kill_object(model_decoder_layers_16_fc2_weight4) + R.vm.kill_object(model_decoder_layers_16_fc2_bias4) + gv3044: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2273: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3044, R.dtype("float16")) + cls.add5(alloc2269, alloc2272, alloc2273) + R.vm.kill_object(alloc2269) + R.vm.kill_object(alloc2272) + model_decoder_layers_17_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[904] + model_decoder_layers_17_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[905] + gv3045: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2274: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3045, R.dtype("float16")) + cls.layer_norm2(alloc2273, model_decoder_layers_17_self_attn_layer_norm_weight4, model_decoder_layers_17_self_attn_layer_norm_bias4, alloc2274) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_17_self_attn_layer_norm_bias4) + model_decoder_layers_17_self_attn_q_proj_weight4: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[900] + model_decoder_layers_17_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[901] + gv3046: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2275: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3046, R.dtype("float16")) + _2274: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_q_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_q_proj_bias4, alloc2275) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_17_self_attn_q_proj_bias4) + gv3047: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1203: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2275, gv3047, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2275) + model_decoder_layers_17_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[897] + gv3048: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2276: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3048, R.dtype("float16")) + _2275: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_17_self_attn_k_proj_weight4, alloc2274, alloc2276) + 
R.vm.kill_object(model_decoder_layers_17_self_attn_k_proj_weight4) + gv3049: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1204: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2276, gv3049, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2276) + model_decoder_layers_17_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[898] + model_decoder_layers_17_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[899] + gv3050: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2277: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3050, R.dtype("float16")) + _2276: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_v_proj_weight4, alloc2274, model_decoder_layers_17_self_attn_v_proj_bias4, alloc2277) + R.vm.kill_object(alloc2274) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_17_self_attn_v_proj_bias4) + gv3051: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1205: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2277, gv3051, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2277) + gv3052: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2278: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3052, R.dtype("float16")) + cls.concatenate1(reshape1203, reshape1204, reshape1205, alloc2278) + R.vm.kill_object(reshape1203) + R.vm.kill_object(reshape1204) + R.vm.kill_object(reshape1205) + gv3053: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1206: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2278, gv3053, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2278) + gv3054: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2279: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3054, R.dtype("float16")) + _2278: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1206, alloc2279) + R.vm.kill_object(reshape1206) + gv3055: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1207: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2279, gv3055, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2279) + gv3056: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1208: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1207, gv3056, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1207) + model_decoder_layers_17_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[902] + model_decoder_layers_17_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[903] + gv3057: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2280: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3057, R.dtype("float16")) + _2279: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_self_attn_out_proj_weight4, reshape1208, model_decoder_layers_17_self_attn_out_proj_bias4, alloc2280) + R.vm.kill_object(reshape1208) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_17_self_attn_out_proj_bias4) + gv3058: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2281: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3058, R.dtype("float16")) + cls.add5(alloc2273, alloc2280, alloc2281) + R.vm.kill_object(alloc2273) + R.vm.kill_object(alloc2280) + model_decoder_layers_17_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[913] + model_decoder_layers_17_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[914] + gv3059: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2282: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3059, R.dtype("float16")) + cls.layer_norm2(alloc2281, model_decoder_layers_17_encoder_attn_layer_norm_weight4, model_decoder_layers_17_encoder_attn_layer_norm_bias4, alloc2282) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_layer_norm_bias4) + model_decoder_layers_17_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[909] + model_decoder_layers_17_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[910] + gv3060: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2283: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3060, R.dtype("float16")) + _2282: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_q_proj_weight4, alloc2282, model_decoder_layers_17_encoder_attn_q_proj_bias4, alloc2283) + R.vm.kill_object(alloc2282) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_q_proj_bias4) + gv3061: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape1209: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2283, gv3061, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2283) + gv3062: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1210: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1209, gv3062, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1209) + gv3063: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2284: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3063, R.dtype("float16")) + _2283: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(17), R.prim_value(T.float32(1)), reshape1210, alloc2284) + R.vm.kill_object(reshape1210) + gv3064: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1211: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2284, gv3064, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2284) + gv3065: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1212: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1211, gv3065, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1211) + model_decoder_layers_17_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[911] + model_decoder_layers_17_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[912] + gv3066: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2285: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3066, R.dtype("float16")) + _2284: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_17_encoder_attn_out_proj_weight4, reshape1212, model_decoder_layers_17_encoder_attn_out_proj_bias4, alloc2285) + R.vm.kill_object(reshape1212) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_17_encoder_attn_out_proj_bias4) + gv3067: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2286: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3067, R.dtype("float16")) + cls.add5(alloc2281, alloc2285, alloc2286) + R.vm.kill_object(alloc2281) + R.vm.kill_object(alloc2285) + model_decoder_layers_17_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[919] + model_decoder_layers_17_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[920] + gv3068: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2287: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3068, R.dtype("float16")) + cls.layer_norm2(alloc2286, model_decoder_layers_17_final_layer_norm_weight4, model_decoder_layers_17_final_layer_norm_bias4, alloc2287) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_17_final_layer_norm_bias4) + model_decoder_layers_17_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[915] + model_decoder_layers_17_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[916] + gv3069: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2288: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3069, R.dtype("float16")) + _2287: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_17_fc1_weight4, alloc2287, model_decoder_layers_17_fc1_bias4, alloc2288) + R.vm.kill_object(alloc2287) + R.vm.kill_object(model_decoder_layers_17_fc1_weight4) + R.vm.kill_object(model_decoder_layers_17_fc1_bias4) + model_decoder_layers_17_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[917] + model_decoder_layers_17_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[918] + gv3070: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2289: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3070, R.dtype("float16")) + _2288: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_17_fc2_weight4, alloc2288, model_decoder_layers_17_fc2_bias4, alloc2289) + R.vm.kill_object(alloc2288) + R.vm.kill_object(model_decoder_layers_17_fc2_weight4) + R.vm.kill_object(model_decoder_layers_17_fc2_bias4) + gv3071: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2290: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3071, R.dtype("float16")) + cls.add5(alloc2286, alloc2289, alloc2290) + R.vm.kill_object(alloc2286) + R.vm.kill_object(alloc2289) + model_decoder_layers_18_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[928] + model_decoder_layers_18_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[929] + gv3072: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2291: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3072, R.dtype("float16")) + cls.layer_norm2(alloc2290, model_decoder_layers_18_self_attn_layer_norm_weight4, model_decoder_layers_18_self_attn_layer_norm_bias4, alloc2291) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_18_self_attn_layer_norm_bias4) + model_decoder_layers_18_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[924] + model_decoder_layers_18_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[925] + gv3073: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2292: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3073, R.dtype("float16")) + _2291: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_q_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_q_proj_bias4, alloc2292) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_18_self_attn_q_proj_bias4) + gv3074: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1213: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2292, gv3074, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2292) + model_decoder_layers_18_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[921] + gv3075: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2293: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3075, R.dtype("float16")) + _2292: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_18_self_attn_k_proj_weight4, alloc2291, alloc2293) + R.vm.kill_object(model_decoder_layers_18_self_attn_k_proj_weight4) + gv3076: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1214: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2293, gv3076, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2293) + model_decoder_layers_18_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[922] + model_decoder_layers_18_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[923] + gv3077: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2294: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3077, R.dtype("float16")) + _2293: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_v_proj_weight4, alloc2291, model_decoder_layers_18_self_attn_v_proj_bias4, alloc2294) + R.vm.kill_object(alloc2291) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_18_self_attn_v_proj_bias4) + gv3078: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1215: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2294, gv3078, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2294) + gv3079: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2295: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3079, R.dtype("float16")) + 
cls.concatenate1(reshape1213, reshape1214, reshape1215, alloc2295) + R.vm.kill_object(reshape1213) + R.vm.kill_object(reshape1214) + R.vm.kill_object(reshape1215) + gv3080: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1216: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2295, gv3080, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2295) + gv3081: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2296: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3081, R.dtype("float16")) + _2295: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape1216, alloc2296) + R.vm.kill_object(reshape1216) + gv3082: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1217: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2296, gv3082, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2296) + gv3083: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1218: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1217, gv3083, 
sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1217) + model_decoder_layers_18_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[926] + model_decoder_layers_18_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[927] + gv3084: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2297: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3084, R.dtype("float16")) + _2296: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_self_attn_out_proj_weight4, reshape1218, model_decoder_layers_18_self_attn_out_proj_bias4, alloc2297) + R.vm.kill_object(reshape1218) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_18_self_attn_out_proj_bias4) + gv3085: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2298: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3085, R.dtype("float16")) + cls.add5(alloc2290, alloc2297, alloc2298) + R.vm.kill_object(alloc2290) + R.vm.kill_object(alloc2297) + model_decoder_layers_18_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[937] + model_decoder_layers_18_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[938] + gv3086: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2299: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3086, R.dtype("float16")) + cls.layer_norm2(alloc2298, model_decoder_layers_18_encoder_attn_layer_norm_weight4, model_decoder_layers_18_encoder_attn_layer_norm_bias4, alloc2299) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_layer_norm_bias4) + model_decoder_layers_18_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[933] + model_decoder_layers_18_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[934] + gv3087: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2300: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3087, R.dtype("float16")) + _2299: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_q_proj_weight4, alloc2299, model_decoder_layers_18_encoder_attn_q_proj_bias4, alloc2300) + R.vm.kill_object(alloc2299) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_q_proj_bias4) + gv3088: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1219: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2300, gv3088, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2300) + gv3089: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1220: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1219, gv3089, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1219) + gv3090: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2301: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3090, R.dtype("float16")) + _2300: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(18), R.prim_value(T.float32(1)), reshape1220, alloc2301) + R.vm.kill_object(reshape1220) + gv3091: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1221: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2301, gv3091, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2301) + gv3092: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1222: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1221, gv3092, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1221) + model_decoder_layers_18_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[935] + model_decoder_layers_18_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[936] + gv3093: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2302: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3093, R.dtype("float16")) + _2301: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_18_encoder_attn_out_proj_weight4, reshape1222, model_decoder_layers_18_encoder_attn_out_proj_bias4, alloc2302) + R.vm.kill_object(reshape1222) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_18_encoder_attn_out_proj_bias4) + gv3094: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2303: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3094, R.dtype("float16")) + cls.add5(alloc2298, alloc2302, alloc2303) + R.vm.kill_object(alloc2298) + R.vm.kill_object(alloc2302) + model_decoder_layers_18_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[943] + model_decoder_layers_18_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[944] + gv3095: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2304: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3095, R.dtype("float16")) + cls.layer_norm2(alloc2303, model_decoder_layers_18_final_layer_norm_weight4, model_decoder_layers_18_final_layer_norm_bias4, alloc2304) + 
R.vm.kill_object(model_decoder_layers_18_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_18_final_layer_norm_bias4) + model_decoder_layers_18_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[939] + model_decoder_layers_18_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[940] + gv3096: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2305: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3096, R.dtype("float16")) + _2304: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_18_fc1_weight4, alloc2304, model_decoder_layers_18_fc1_bias4, alloc2305) + R.vm.kill_object(alloc2304) + R.vm.kill_object(model_decoder_layers_18_fc1_weight4) + R.vm.kill_object(model_decoder_layers_18_fc1_bias4) + model_decoder_layers_18_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[941] + model_decoder_layers_18_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[942] + gv3097: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2306: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3097, R.dtype("float16")) + _2305: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_18_fc2_weight4, alloc2305, model_decoder_layers_18_fc2_bias4, alloc2306) + R.vm.kill_object(alloc2305) + R.vm.kill_object(model_decoder_layers_18_fc2_weight4) + R.vm.kill_object(model_decoder_layers_18_fc2_bias4) + gv3098: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2307: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3098, R.dtype("float16")) + cls.add5(alloc2303, alloc2306, alloc2307) + R.vm.kill_object(alloc2303) + R.vm.kill_object(alloc2306) + model_decoder_layers_19_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[952] + model_decoder_layers_19_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[953] + gv3099: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2308: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3099, R.dtype("float16")) + cls.layer_norm2(alloc2307, model_decoder_layers_19_self_attn_layer_norm_weight4, model_decoder_layers_19_self_attn_layer_norm_bias4, alloc2308) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_19_self_attn_layer_norm_bias4) + model_decoder_layers_19_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[948] + model_decoder_layers_19_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[949] + gv3100: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2309: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3100, R.dtype("float16")) + _2308: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_q_proj_weight4, alloc2308, 
model_decoder_layers_19_self_attn_q_proj_bias4, alloc2309) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_19_self_attn_q_proj_bias4) + gv3101: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1223: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2309, gv3101, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2309) + model_decoder_layers_19_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[945] + gv3102: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2310: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3102, R.dtype("float16")) + _2309: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_19_self_attn_k_proj_weight4, alloc2308, alloc2310) + R.vm.kill_object(model_decoder_layers_19_self_attn_k_proj_weight4) + gv3103: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1224: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2310, gv3103, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2310) + model_decoder_layers_19_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[946] + 
model_decoder_layers_19_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[947] + gv3104: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2311: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3104, R.dtype("float16")) + _2310: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_v_proj_weight4, alloc2308, model_decoder_layers_19_self_attn_v_proj_bias4, alloc2311) + R.vm.kill_object(alloc2308) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_19_self_attn_v_proj_bias4) + gv3105: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1225: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2311, gv3105, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2311) + gv3106: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2312: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3106, R.dtype("float16")) + cls.concatenate1(reshape1223, reshape1224, reshape1225, alloc2312) + R.vm.kill_object(reshape1223) + R.vm.kill_object(reshape1224) + R.vm.kill_object(reshape1225) + gv3107: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1226: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2312, gv3107, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2312) + gv3108: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2313: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3108, R.dtype("float16")) + _2312: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1226, alloc2313) + R.vm.kill_object(reshape1226) + gv3109: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1227: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2313, gv3109, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2313) + gv3110: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1228: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1227, gv3110, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1227) + model_decoder_layers_19_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[950] + model_decoder_layers_19_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[951] + gv3111: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2314: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3111, R.dtype("float16")) + _2313: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_self_attn_out_proj_weight4, reshape1228, model_decoder_layers_19_self_attn_out_proj_bias4, alloc2314) + R.vm.kill_object(reshape1228) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_19_self_attn_out_proj_bias4) + gv3112: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2315: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3112, R.dtype("float16")) + cls.add5(alloc2307, alloc2314, alloc2315) + R.vm.kill_object(alloc2307) + R.vm.kill_object(alloc2314) + model_decoder_layers_19_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[961] + model_decoder_layers_19_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[962] + gv3113: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2316: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3113, R.dtype("float16")) + cls.layer_norm2(alloc2315, model_decoder_layers_19_encoder_attn_layer_norm_weight4, model_decoder_layers_19_encoder_attn_layer_norm_bias4, alloc2316) + 
R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_layer_norm_bias4) + model_decoder_layers_19_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[957] + model_decoder_layers_19_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[958] + gv3114: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2317: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3114, R.dtype("float16")) + _2316: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_q_proj_weight4, alloc2316, model_decoder_layers_19_encoder_attn_q_proj_bias4, alloc2317) + R.vm.kill_object(alloc2316) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_q_proj_bias4) + gv3115: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1229: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2317, gv3115, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2317) + gv3116: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1230: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1229, gv3116, sinfo_args=(R.Tensor((seq_len, 20, 64), 
dtype="float16"),)) + R.vm.kill_object(reshape1229) + gv3117: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2318: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3117, R.dtype("float16")) + _2317: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(19), R.prim_value(T.float32(1)), reshape1230, alloc2318) + R.vm.kill_object(reshape1230) + gv3118: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1231: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2318, gv3118, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2318) + gv3119: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1232: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1231, gv3119, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1231) + model_decoder_layers_19_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[959] + model_decoder_layers_19_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[960] + gv3120: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc2319: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3120, R.dtype("float16")) + _2318: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_19_encoder_attn_out_proj_weight4, reshape1232, model_decoder_layers_19_encoder_attn_out_proj_bias4, alloc2319) + R.vm.kill_object(reshape1232) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_19_encoder_attn_out_proj_bias4) + gv3121: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2320: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3121, R.dtype("float16")) + cls.add5(alloc2315, alloc2319, alloc2320) + R.vm.kill_object(alloc2315) + R.vm.kill_object(alloc2319) + model_decoder_layers_19_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[967] + model_decoder_layers_19_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[968] + gv3122: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2321: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3122, R.dtype("float16")) + cls.layer_norm2(alloc2320, model_decoder_layers_19_final_layer_norm_weight4, model_decoder_layers_19_final_layer_norm_bias4, alloc2321) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_19_final_layer_norm_bias4) + model_decoder_layers_19_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[963] + model_decoder_layers_19_fc1_bias4: R.Tensor((5120,), dtype="float16") = 
packed_params[964] + gv3123: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2322: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3123, R.dtype("float16")) + _2321: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_19_fc1_weight4, alloc2321, model_decoder_layers_19_fc1_bias4, alloc2322) + R.vm.kill_object(alloc2321) + R.vm.kill_object(model_decoder_layers_19_fc1_weight4) + R.vm.kill_object(model_decoder_layers_19_fc1_bias4) + model_decoder_layers_19_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[965] + model_decoder_layers_19_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[966] + gv3124: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2323: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3124, R.dtype("float16")) + _2322: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_19_fc2_weight4, alloc2322, model_decoder_layers_19_fc2_bias4, alloc2323) + R.vm.kill_object(alloc2322) + R.vm.kill_object(model_decoder_layers_19_fc2_weight4) + R.vm.kill_object(model_decoder_layers_19_fc2_bias4) + gv3125: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2324: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3125, R.dtype("float16")) + cls.add5(alloc2320, alloc2323, alloc2324) + 
R.vm.kill_object(alloc2320) + R.vm.kill_object(alloc2323) + model_decoder_layers_20_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[976] + model_decoder_layers_20_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[977] + gv3126: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2325: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3126, R.dtype("float16")) + cls.layer_norm2(alloc2324, model_decoder_layers_20_self_attn_layer_norm_weight4, model_decoder_layers_20_self_attn_layer_norm_bias4, alloc2325) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_20_self_attn_layer_norm_bias4) + model_decoder_layers_20_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[972] + model_decoder_layers_20_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[973] + gv3127: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2326: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3127, R.dtype("float16")) + _2325: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_q_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_q_proj_bias4, alloc2326) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_20_self_attn_q_proj_bias4) + gv3128: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1233: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2326, gv3128, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2326) + model_decoder_layers_20_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[969] + gv3129: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2327: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3129, R.dtype("float16")) + _2326: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_20_self_attn_k_proj_weight4, alloc2325, alloc2327) + R.vm.kill_object(model_decoder_layers_20_self_attn_k_proj_weight4) + gv3130: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1234: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2327, gv3130, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2327) + model_decoder_layers_20_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[970] + model_decoder_layers_20_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[971] + gv3131: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2328: R.Tensor(dtype="float16", ndim=3) 
= R.vm.alloc_tensor(storage37, R.prim_value(0), gv3131, R.dtype("float16")) + _2327: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_v_proj_weight4, alloc2325, model_decoder_layers_20_self_attn_v_proj_bias4, alloc2328) + R.vm.kill_object(alloc2325) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_20_self_attn_v_proj_bias4) + gv3132: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1235: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2328, gv3132, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2328) + gv3133: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2329: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3133, R.dtype("float16")) + cls.concatenate1(reshape1233, reshape1234, reshape1235, alloc2329) + R.vm.kill_object(reshape1233) + R.vm.kill_object(reshape1234) + R.vm.kill_object(reshape1235) + gv3134: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1236: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2329, gv3134, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2329) + gv3135: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2330: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3135, R.dtype("float16")) + _2329: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1236, alloc2330) + R.vm.kill_object(reshape1236) + gv3136: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1237: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2330, gv3136, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2330) + gv3137: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1238: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1237, gv3137, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1237) + model_decoder_layers_20_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[974] + model_decoder_layers_20_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[975] + gv3138: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2331: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3138, R.dtype("float16")) + 
_2330: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_self_attn_out_proj_weight4, reshape1238, model_decoder_layers_20_self_attn_out_proj_bias4, alloc2331) + R.vm.kill_object(reshape1238) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_20_self_attn_out_proj_bias4) + gv3139: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2332: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3139, R.dtype("float16")) + cls.add5(alloc2324, alloc2331, alloc2332) + R.vm.kill_object(alloc2324) + R.vm.kill_object(alloc2331) + model_decoder_layers_20_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[985] + model_decoder_layers_20_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[986] + gv3140: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2333: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3140, R.dtype("float16")) + cls.layer_norm2(alloc2332, model_decoder_layers_20_encoder_attn_layer_norm_weight4, model_decoder_layers_20_encoder_attn_layer_norm_bias4, alloc2333) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_layer_norm_bias4) + model_decoder_layers_20_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[981] + model_decoder_layers_20_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[982] + gv3141: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2334: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3141, R.dtype("float16")) + _2333: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_q_proj_weight4, alloc2333, model_decoder_layers_20_encoder_attn_q_proj_bias4, alloc2334) + R.vm.kill_object(alloc2333) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_q_proj_bias4) + gv3142: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1239: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2334, gv3142, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2334) + gv3143: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1240: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1239, gv3143, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1239) + gv3144: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2335: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3144, R.dtype("float16")) + 
_2334: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(20), R.prim_value(T.float32(1)), reshape1240, alloc2335) + R.vm.kill_object(reshape1240) + gv3145: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1241: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2335, gv3145, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2335) + gv3146: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1242: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1241, gv3146, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1241) + model_decoder_layers_20_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[983] + model_decoder_layers_20_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[984] + gv3147: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2336: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3147, R.dtype("float16")) + _2335: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_20_encoder_attn_out_proj_weight4, reshape1242, model_decoder_layers_20_encoder_attn_out_proj_bias4, alloc2336) + R.vm.kill_object(reshape1242) + 
R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_20_encoder_attn_out_proj_bias4) + gv3148: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2337: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3148, R.dtype("float16")) + cls.add5(alloc2332, alloc2336, alloc2337) + R.vm.kill_object(alloc2332) + R.vm.kill_object(alloc2336) + model_decoder_layers_20_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[991] + model_decoder_layers_20_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[992] + gv3149: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2338: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3149, R.dtype("float16")) + cls.layer_norm2(alloc2337, model_decoder_layers_20_final_layer_norm_weight4, model_decoder_layers_20_final_layer_norm_bias4, alloc2338) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_20_final_layer_norm_bias4) + model_decoder_layers_20_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[987] + model_decoder_layers_20_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[988] + gv3150: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2339: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3150, R.dtype("float16")) + 
_2338: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_20_fc1_weight4, alloc2338, model_decoder_layers_20_fc1_bias4, alloc2339) + R.vm.kill_object(alloc2338) + R.vm.kill_object(model_decoder_layers_20_fc1_weight4) + R.vm.kill_object(model_decoder_layers_20_fc1_bias4) + model_decoder_layers_20_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[989] + model_decoder_layers_20_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[990] + gv3151: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2340: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3151, R.dtype("float16")) + _2339: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_20_fc2_weight4, alloc2339, model_decoder_layers_20_fc2_bias4, alloc2340) + R.vm.kill_object(alloc2339) + R.vm.kill_object(model_decoder_layers_20_fc2_weight4) + R.vm.kill_object(model_decoder_layers_20_fc2_bias4) + gv3152: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2341: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3152, R.dtype("float16")) + cls.add5(alloc2337, alloc2340, alloc2341) + R.vm.kill_object(alloc2337) + R.vm.kill_object(alloc2340) + model_decoder_layers_21_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1000] + model_decoder_layers_21_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1001] + gv3153: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2342: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3153, R.dtype("float16")) + cls.layer_norm2(alloc2341, model_decoder_layers_21_self_attn_layer_norm_weight4, model_decoder_layers_21_self_attn_layer_norm_bias4, alloc2342) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_21_self_attn_layer_norm_bias4) + model_decoder_layers_21_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[996] + model_decoder_layers_21_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[997] + gv3154: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2343: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3154, R.dtype("float16")) + _2342: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_q_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_q_proj_bias4, alloc2343) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_21_self_attn_q_proj_bias4) + gv3155: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1243: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2343, gv3155, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2343) + 
model_decoder_layers_21_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[993] + gv3156: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2344: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3156, R.dtype("float16")) + _2343: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_21_self_attn_k_proj_weight4, alloc2342, alloc2344) + R.vm.kill_object(model_decoder_layers_21_self_attn_k_proj_weight4) + gv3157: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1244: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2344, gv3157, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2344) + model_decoder_layers_21_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[994] + model_decoder_layers_21_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[995] + gv3158: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2345: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3158, R.dtype("float16")) + _2344: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_v_proj_weight4, alloc2342, model_decoder_layers_21_self_attn_v_proj_bias4, alloc2345) + R.vm.kill_object(alloc2342) + 
R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_21_self_attn_v_proj_bias4) + gv3159: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1245: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2345, gv3159, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2345) + gv3160: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2346: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3160, R.dtype("float16")) + cls.concatenate1(reshape1243, reshape1244, reshape1245, alloc2346) + R.vm.kill_object(reshape1243) + R.vm.kill_object(reshape1244) + R.vm.kill_object(reshape1245) + gv3161: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1246: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2346, gv3161, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2346) + gv3162: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2347: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3162, R.dtype("float16")) + _2346: R.Object = 
R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1246, alloc2347) + R.vm.kill_object(reshape1246) + gv3163: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1247: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2347, gv3163, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2347) + gv3164: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1248: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1247, gv3164, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1247) + model_decoder_layers_21_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[998] + model_decoder_layers_21_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[999] + gv3165: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2348: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3165, R.dtype("float16")) + _2347: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_self_attn_out_proj_weight4, reshape1248, model_decoder_layers_21_self_attn_out_proj_bias4, alloc2348) + R.vm.kill_object(reshape1248) + 
R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_21_self_attn_out_proj_bias4) + gv3166: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2349: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3166, R.dtype("float16")) + cls.add5(alloc2341, alloc2348, alloc2349) + R.vm.kill_object(alloc2341) + R.vm.kill_object(alloc2348) + model_decoder_layers_21_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1009] + model_decoder_layers_21_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1010] + gv3167: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2350: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3167, R.dtype("float16")) + cls.layer_norm2(alloc2349, model_decoder_layers_21_encoder_attn_layer_norm_weight4, model_decoder_layers_21_encoder_attn_layer_norm_bias4, alloc2350) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_layer_norm_bias4) + model_decoder_layers_21_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1005] + model_decoder_layers_21_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1006] + gv3168: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2351: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv3168, R.dtype("float16")) + _2350: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_q_proj_weight4, alloc2350, model_decoder_layers_21_encoder_attn_q_proj_bias4, alloc2351) + R.vm.kill_object(alloc2350) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_q_proj_bias4) + gv3169: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1249: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2351, gv3169, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2351) + gv3170: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1250: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1249, gv3170, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1249) + gv3171: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2352: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3171, R.dtype("float16")) + _2351: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(21), R.prim_value(T.float32(1)), reshape1250, alloc2352) + R.vm.kill_object(reshape1250) + gv3172: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1251: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2352, gv3172, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2352) + gv3173: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1252: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1251, gv3173, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1251) + model_decoder_layers_21_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1007] + model_decoder_layers_21_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1008] + gv3174: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2353: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3174, R.dtype("float16")) + _2352: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_21_encoder_attn_out_proj_weight4, reshape1252, model_decoder_layers_21_encoder_attn_out_proj_bias4, alloc2353) + R.vm.kill_object(reshape1252) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_21_encoder_attn_out_proj_bias4) + gv3175: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2354: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3175, R.dtype("float16")) + cls.add5(alloc2349, alloc2353, alloc2354) + R.vm.kill_object(alloc2349) + R.vm.kill_object(alloc2353) + model_decoder_layers_21_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1015] + model_decoder_layers_21_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1016] + gv3176: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2355: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3176, R.dtype("float16")) + cls.layer_norm2(alloc2354, model_decoder_layers_21_final_layer_norm_weight4, model_decoder_layers_21_final_layer_norm_bias4, alloc2355) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_21_final_layer_norm_bias4) + model_decoder_layers_21_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1011] + model_decoder_layers_21_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1012] + gv3177: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2356: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3177, R.dtype("float16")) + _2355: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_21_fc1_weight4, alloc2355, model_decoder_layers_21_fc1_bias4, alloc2356) + R.vm.kill_object(alloc2355) + 
R.vm.kill_object(model_decoder_layers_21_fc1_weight4) + R.vm.kill_object(model_decoder_layers_21_fc1_bias4) + model_decoder_layers_21_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1013] + model_decoder_layers_21_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1014] + gv3178: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2357: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3178, R.dtype("float16")) + _2356: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_21_fc2_weight4, alloc2356, model_decoder_layers_21_fc2_bias4, alloc2357) + R.vm.kill_object(alloc2356) + R.vm.kill_object(model_decoder_layers_21_fc2_weight4) + R.vm.kill_object(model_decoder_layers_21_fc2_bias4) + gv3179: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2358: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3179, R.dtype("float16")) + cls.add5(alloc2354, alloc2357, alloc2358) + R.vm.kill_object(alloc2354) + R.vm.kill_object(alloc2357) + model_decoder_layers_22_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1024] + model_decoder_layers_22_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1025] + gv3180: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2359: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), 
gv3180, R.dtype("float16")) + cls.layer_norm2(alloc2358, model_decoder_layers_22_self_attn_layer_norm_weight4, model_decoder_layers_22_self_attn_layer_norm_bias4, alloc2359) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_22_self_attn_layer_norm_bias4) + model_decoder_layers_22_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1020] + model_decoder_layers_22_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1021] + gv3181: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2360: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3181, R.dtype("float16")) + _2359: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_q_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_q_proj_bias4, alloc2360) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_22_self_attn_q_proj_bias4) + gv3182: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1253: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2360, gv3182, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2360) + model_decoder_layers_22_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1017] + gv3183: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2361: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3183, R.dtype("float16")) + _2360: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_22_self_attn_k_proj_weight4, alloc2359, alloc2361) + R.vm.kill_object(model_decoder_layers_22_self_attn_k_proj_weight4) + gv3184: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1254: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2361, gv3184, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2361) + model_decoder_layers_22_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1018] + model_decoder_layers_22_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1019] + gv3185: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2362: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3185, R.dtype("float16")) + _2361: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_v_proj_weight4, alloc2359, model_decoder_layers_22_self_attn_v_proj_bias4, alloc2362) + R.vm.kill_object(alloc2359) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_22_self_attn_v_proj_bias4) + gv3186: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1255: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2362, gv3186, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2362) + gv3187: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2363: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3187, R.dtype("float16")) + cls.concatenate1(reshape1253, reshape1254, reshape1255, alloc2363) + R.vm.kill_object(reshape1253) + R.vm.kill_object(reshape1254) + R.vm.kill_object(reshape1255) + gv3188: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1256: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2363, gv3188, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2363) + gv3189: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2364: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3189, R.dtype("float16")) + _2363: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1256, alloc2364) + R.vm.kill_object(reshape1256) + gv3190: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1257: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2364, gv3190, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2364) + gv3191: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1258: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1257, gv3191, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1257) + model_decoder_layers_22_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1022] + model_decoder_layers_22_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1023] + gv3192: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2365: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3192, R.dtype("float16")) + _2364: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_self_attn_out_proj_weight4, reshape1258, model_decoder_layers_22_self_attn_out_proj_bias4, alloc2365) + R.vm.kill_object(reshape1258) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_22_self_attn_out_proj_bias4) + gv3193: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2366: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3193, R.dtype("float16")) + cls.add5(alloc2358, alloc2365, alloc2366) + R.vm.kill_object(alloc2358) + R.vm.kill_object(alloc2365) + model_decoder_layers_22_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1033] + model_decoder_layers_22_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1034] + gv3194: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2367: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3194, R.dtype("float16")) + cls.layer_norm2(alloc2366, model_decoder_layers_22_encoder_attn_layer_norm_weight4, model_decoder_layers_22_encoder_attn_layer_norm_bias4, alloc2367) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_layer_norm_bias4) + model_decoder_layers_22_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1029] + model_decoder_layers_22_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1030] + gv3195: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2368: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3195, R.dtype("float16")) + _2367: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_q_proj_weight4, alloc2367, model_decoder_layers_22_encoder_attn_q_proj_bias4, alloc2368) + R.vm.kill_object(alloc2367) + 
R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_q_proj_bias4) + gv3196: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1259: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2368, gv3196, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2368) + gv3197: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1260: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1259, gv3197, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1259) + gv3198: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2369: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3198, R.dtype("float16")) + _2368: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(22), R.prim_value(T.float32(1)), reshape1260, alloc2369) + R.vm.kill_object(reshape1260) + gv3199: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1261: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2369, gv3199, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2369) + gv3200: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1262: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1261, gv3200, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1261) + model_decoder_layers_22_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1031] + model_decoder_layers_22_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1032] + gv3201: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2370: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3201, R.dtype("float16")) + _2369: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_22_encoder_attn_out_proj_weight4, reshape1262, model_decoder_layers_22_encoder_attn_out_proj_bias4, alloc2370) + R.vm.kill_object(reshape1262) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_22_encoder_attn_out_proj_bias4) + gv3202: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2371: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3202, R.dtype("float16")) + cls.add5(alloc2366, alloc2370, alloc2371) + R.vm.kill_object(alloc2366) + R.vm.kill_object(alloc2370) + 
model_decoder_layers_22_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1039] + model_decoder_layers_22_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1040] + gv3203: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2372: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3203, R.dtype("float16")) + cls.layer_norm2(alloc2371, model_decoder_layers_22_final_layer_norm_weight4, model_decoder_layers_22_final_layer_norm_bias4, alloc2372) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_22_final_layer_norm_bias4) + model_decoder_layers_22_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1035] + model_decoder_layers_22_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1036] + gv3204: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2373: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3204, R.dtype("float16")) + _2372: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_22_fc1_weight4, alloc2372, model_decoder_layers_22_fc1_bias4, alloc2373) + R.vm.kill_object(alloc2372) + R.vm.kill_object(model_decoder_layers_22_fc1_weight4) + R.vm.kill_object(model_decoder_layers_22_fc1_bias4) + model_decoder_layers_22_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1037] + model_decoder_layers_22_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1038] + gv3205: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2374: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3205, R.dtype("float16")) + _2373: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_22_fc2_weight4, alloc2373, model_decoder_layers_22_fc2_bias4, alloc2374) + R.vm.kill_object(alloc2373) + R.vm.kill_object(model_decoder_layers_22_fc2_weight4) + R.vm.kill_object(model_decoder_layers_22_fc2_bias4) + gv3206: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2375: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3206, R.dtype("float16")) + cls.add5(alloc2371, alloc2374, alloc2375) + R.vm.kill_object(alloc2371) + R.vm.kill_object(alloc2374) + model_decoder_layers_23_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1048] + model_decoder_layers_23_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1049] + gv3207: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2376: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3207, R.dtype("float16")) + cls.layer_norm2(alloc2375, model_decoder_layers_23_self_attn_layer_norm_weight4, model_decoder_layers_23_self_attn_layer_norm_bias4, alloc2376) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_23_self_attn_layer_norm_bias4) + model_decoder_layers_23_self_attn_q_proj_weight4: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1044] + model_decoder_layers_23_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1045] + gv3208: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2377: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3208, R.dtype("float16")) + _2376: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_q_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_q_proj_bias4, alloc2377) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_23_self_attn_q_proj_bias4) + gv3209: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1263: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2377, gv3209, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2377) + model_decoder_layers_23_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1041] + gv3210: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2378: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3210, R.dtype("float16")) + _2377: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_23_self_attn_k_proj_weight4, alloc2376, alloc2378) + 
R.vm.kill_object(model_decoder_layers_23_self_attn_k_proj_weight4) + gv3211: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1264: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2378, gv3211, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2378) + model_decoder_layers_23_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1042] + model_decoder_layers_23_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1043] + gv3212: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2379: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3212, R.dtype("float16")) + _2378: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_v_proj_weight4, alloc2376, model_decoder_layers_23_self_attn_v_proj_bias4, alloc2379) + R.vm.kill_object(alloc2376) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_23_self_attn_v_proj_bias4) + gv3213: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1265: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2379, gv3213, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2379) + gv3214: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2380: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3214, R.dtype("float16")) + cls.concatenate1(reshape1263, reshape1264, reshape1265, alloc2380) + R.vm.kill_object(reshape1263) + R.vm.kill_object(reshape1264) + R.vm.kill_object(reshape1265) + gv3215: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1266: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2380, gv3215, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2380) + gv3216: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2381: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3216, R.dtype("float16")) + _2380: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1266, alloc2381) + R.vm.kill_object(reshape1266) + gv3217: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1267: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2381, gv3217, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2381) + gv3218: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1268: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1267, gv3218, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1267) + model_decoder_layers_23_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1046] + model_decoder_layers_23_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1047] + gv3219: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2382: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3219, R.dtype("float16")) + _2381: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_self_attn_out_proj_weight4, reshape1268, model_decoder_layers_23_self_attn_out_proj_bias4, alloc2382) + R.vm.kill_object(reshape1268) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_23_self_attn_out_proj_bias4) + gv3220: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2383: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3220, R.dtype("float16")) + cls.add5(alloc2375, alloc2382, alloc2383) + R.vm.kill_object(alloc2375) + R.vm.kill_object(alloc2382) + model_decoder_layers_23_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[1057] + model_decoder_layers_23_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1058] + gv3221: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2384: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3221, R.dtype("float16")) + cls.layer_norm2(alloc2383, model_decoder_layers_23_encoder_attn_layer_norm_weight4, model_decoder_layers_23_encoder_attn_layer_norm_bias4, alloc2384) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_layer_norm_bias4) + model_decoder_layers_23_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1053] + model_decoder_layers_23_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1054] + gv3222: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2385: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3222, R.dtype("float16")) + _2384: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_q_proj_weight4, alloc2384, model_decoder_layers_23_encoder_attn_q_proj_bias4, alloc2385) + R.vm.kill_object(alloc2384) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_q_proj_bias4) + gv3223: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape1269: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2385, gv3223, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2385) + gv3224: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1270: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1269, gv3224, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1269) + gv3225: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2386: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3225, R.dtype("float16")) + _2385: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(23), R.prim_value(T.float32(1)), reshape1270, alloc2386) + R.vm.kill_object(reshape1270) + gv3226: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1271: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2386, gv3226, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2386) + gv3227: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1272: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1271, gv3227, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1271) + model_decoder_layers_23_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1055] + model_decoder_layers_23_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1056] + gv3228: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2387: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3228, R.dtype("float16")) + _2386: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_23_encoder_attn_out_proj_weight4, reshape1272, model_decoder_layers_23_encoder_attn_out_proj_bias4, alloc2387) + R.vm.kill_object(reshape1272) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_23_encoder_attn_out_proj_bias4) + gv3229: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2388: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3229, R.dtype("float16")) + cls.add5(alloc2383, alloc2387, alloc2388) + R.vm.kill_object(alloc2383) + R.vm.kill_object(alloc2387) + model_decoder_layers_23_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1063] + model_decoder_layers_23_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1064] + gv3230: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2389: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3230, R.dtype("float16")) + cls.layer_norm2(alloc2388, model_decoder_layers_23_final_layer_norm_weight4, model_decoder_layers_23_final_layer_norm_bias4, alloc2389) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_23_final_layer_norm_bias4) + model_decoder_layers_23_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1059] + model_decoder_layers_23_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1060] + gv3231: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2390: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3231, R.dtype("float16")) + _2389: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_23_fc1_weight4, alloc2389, model_decoder_layers_23_fc1_bias4, alloc2390) + R.vm.kill_object(alloc2389) + R.vm.kill_object(model_decoder_layers_23_fc1_weight4) + R.vm.kill_object(model_decoder_layers_23_fc1_bias4) + model_decoder_layers_23_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1061] + model_decoder_layers_23_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1062] + gv3232: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2391: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3232, R.dtype("float16")) + _2390: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_23_fc2_weight4, alloc2390, model_decoder_layers_23_fc2_bias4, alloc2391) + R.vm.kill_object(alloc2390) + R.vm.kill_object(model_decoder_layers_23_fc2_weight4) + R.vm.kill_object(model_decoder_layers_23_fc2_bias4) + gv3233: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2392: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3233, R.dtype("float16")) + cls.add5(alloc2388, alloc2391, alloc2392) + R.vm.kill_object(alloc2388) + R.vm.kill_object(alloc2391) + model_decoder_layers_24_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1072] + model_decoder_layers_24_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1073] + gv3234: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2393: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3234, R.dtype("float16")) + cls.layer_norm2(alloc2392, model_decoder_layers_24_self_attn_layer_norm_weight4, model_decoder_layers_24_self_attn_layer_norm_bias4, alloc2393) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_24_self_attn_layer_norm_bias4) + model_decoder_layers_24_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1068] + model_decoder_layers_24_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1069] + gv3235: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2394: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3235, R.dtype("float16")) + _2393: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_q_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_q_proj_bias4, alloc2394) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_24_self_attn_q_proj_bias4) + gv3236: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1273: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2394, gv3236, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2394) + model_decoder_layers_24_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1065] + gv3237: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2395: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3237, R.dtype("float16")) + _2394: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_24_self_attn_k_proj_weight4, alloc2393, alloc2395) + R.vm.kill_object(model_decoder_layers_24_self_attn_k_proj_weight4) + gv3238: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1274: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2395, gv3238, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2395) + model_decoder_layers_24_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1066] + model_decoder_layers_24_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1067] + gv3239: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2396: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3239, R.dtype("float16")) + _2395: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_v_proj_weight4, alloc2393, model_decoder_layers_24_self_attn_v_proj_bias4, alloc2396) + R.vm.kill_object(alloc2393) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_24_self_attn_v_proj_bias4) + gv3240: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1275: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2396, gv3240, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2396) + gv3241: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2397: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3241, R.dtype("float16")) + 
cls.concatenate1(reshape1273, reshape1274, reshape1275, alloc2397) + R.vm.kill_object(reshape1273) + R.vm.kill_object(reshape1274) + R.vm.kill_object(reshape1275) + gv3242: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1276: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2397, gv3242, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2397) + gv3243: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2398: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3243, R.dtype("float16")) + _2397: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1276, alloc2398) + R.vm.kill_object(reshape1276) + gv3244: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1277: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2398, gv3244, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2398) + gv3245: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1278: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1277, gv3245, 
sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1277) + model_decoder_layers_24_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1070] + model_decoder_layers_24_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1071] + gv3246: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2399: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3246, R.dtype("float16")) + _2398: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_self_attn_out_proj_weight4, reshape1278, model_decoder_layers_24_self_attn_out_proj_bias4, alloc2399) + R.vm.kill_object(reshape1278) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_24_self_attn_out_proj_bias4) + gv3247: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2400: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3247, R.dtype("float16")) + cls.add5(alloc2392, alloc2399, alloc2400) + R.vm.kill_object(alloc2392) + R.vm.kill_object(alloc2399) + model_decoder_layers_24_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1081] + model_decoder_layers_24_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1082] + gv3248: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2401: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3248, R.dtype("float16")) + cls.layer_norm2(alloc2400, model_decoder_layers_24_encoder_attn_layer_norm_weight4, model_decoder_layers_24_encoder_attn_layer_norm_bias4, alloc2401) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_layer_norm_bias4) + model_decoder_layers_24_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1077] + model_decoder_layers_24_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1078] + gv3249: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2402: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3249, R.dtype("float16")) + _2401: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_q_proj_weight4, alloc2401, model_decoder_layers_24_encoder_attn_q_proj_bias4, alloc2402) + R.vm.kill_object(alloc2401) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_q_proj_bias4) + gv3250: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1279: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2402, gv3250, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2402) + gv3251: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1280: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1279, gv3251, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1279) + gv3252: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2403: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3252, R.dtype("float16")) + _2402: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(24), R.prim_value(T.float32(1)), reshape1280, alloc2403) + R.vm.kill_object(reshape1280) + gv3253: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1281: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2403, gv3253, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2403) + gv3254: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1282: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1281, gv3254, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1281) + model_decoder_layers_24_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1079] + model_decoder_layers_24_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") 
= packed_params[1080] + gv3255: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2404: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3255, R.dtype("float16")) + _2403: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_24_encoder_attn_out_proj_weight4, reshape1282, model_decoder_layers_24_encoder_attn_out_proj_bias4, alloc2404) + R.vm.kill_object(reshape1282) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_24_encoder_attn_out_proj_bias4) + gv3256: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2405: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3256, R.dtype("float16")) + cls.add5(alloc2400, alloc2404, alloc2405) + R.vm.kill_object(alloc2400) + R.vm.kill_object(alloc2404) + model_decoder_layers_24_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1087] + model_decoder_layers_24_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1088] + gv3257: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2406: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3257, R.dtype("float16")) + cls.layer_norm2(alloc2405, model_decoder_layers_24_final_layer_norm_weight4, model_decoder_layers_24_final_layer_norm_bias4, alloc2406) + 
R.vm.kill_object(model_decoder_layers_24_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_24_final_layer_norm_bias4) + model_decoder_layers_24_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1083] + model_decoder_layers_24_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1084] + gv3258: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2407: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3258, R.dtype("float16")) + _2406: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_24_fc1_weight4, alloc2406, model_decoder_layers_24_fc1_bias4, alloc2407) + R.vm.kill_object(alloc2406) + R.vm.kill_object(model_decoder_layers_24_fc1_weight4) + R.vm.kill_object(model_decoder_layers_24_fc1_bias4) + model_decoder_layers_24_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1085] + model_decoder_layers_24_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1086] + gv3259: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2408: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3259, R.dtype("float16")) + _2407: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_24_fc2_weight4, alloc2407, model_decoder_layers_24_fc2_bias4, alloc2408) + R.vm.kill_object(alloc2407) + R.vm.kill_object(model_decoder_layers_24_fc2_weight4) + R.vm.kill_object(model_decoder_layers_24_fc2_bias4) + gv3260: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2409: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3260, R.dtype("float16")) + cls.add5(alloc2405, alloc2408, alloc2409) + R.vm.kill_object(alloc2405) + R.vm.kill_object(alloc2408) + model_decoder_layers_25_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1096] + model_decoder_layers_25_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1097] + gv3261: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2410: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3261, R.dtype("float16")) + cls.layer_norm2(alloc2409, model_decoder_layers_25_self_attn_layer_norm_weight4, model_decoder_layers_25_self_attn_layer_norm_bias4, alloc2410) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_25_self_attn_layer_norm_bias4) + model_decoder_layers_25_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1092] + model_decoder_layers_25_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1093] + gv3262: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2411: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3262, R.dtype("float16")) + _2410: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_q_proj_weight4, alloc2410, 
model_decoder_layers_25_self_attn_q_proj_bias4, alloc2411) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_25_self_attn_q_proj_bias4) + gv3263: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1283: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2411, gv3263, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2411) + model_decoder_layers_25_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1089] + gv3264: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2412: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3264, R.dtype("float16")) + _2411: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_25_self_attn_k_proj_weight4, alloc2410, alloc2412) + R.vm.kill_object(model_decoder_layers_25_self_attn_k_proj_weight4) + gv3265: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1284: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2412, gv3265, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2412) + model_decoder_layers_25_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1090] + 
model_decoder_layers_25_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1091] + gv3266: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2413: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3266, R.dtype("float16")) + _2412: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_v_proj_weight4, alloc2410, model_decoder_layers_25_self_attn_v_proj_bias4, alloc2413) + R.vm.kill_object(alloc2410) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_25_self_attn_v_proj_bias4) + gv3267: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1285: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2413, gv3267, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2413) + gv3268: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2414: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3268, R.dtype("float16")) + cls.concatenate1(reshape1283, reshape1284, reshape1285, alloc2414) + R.vm.kill_object(reshape1283) + R.vm.kill_object(reshape1284) + R.vm.kill_object(reshape1285) + gv3269: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1286: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2414, gv3269, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2414) + gv3270: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2415: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3270, R.dtype("float16")) + _2414: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1286, alloc2415) + R.vm.kill_object(reshape1286) + gv3271: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1287: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2415, gv3271, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2415) + gv3272: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1288: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1287, gv3272, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1287) + model_decoder_layers_25_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1094] + model_decoder_layers_25_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[1095] + gv3273: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2416: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3273, R.dtype("float16")) + _2415: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_self_attn_out_proj_weight4, reshape1288, model_decoder_layers_25_self_attn_out_proj_bias4, alloc2416) + R.vm.kill_object(reshape1288) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_25_self_attn_out_proj_bias4) + gv3274: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2417: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3274, R.dtype("float16")) + cls.add5(alloc2409, alloc2416, alloc2417) + R.vm.kill_object(alloc2409) + R.vm.kill_object(alloc2416) + model_decoder_layers_25_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1105] + model_decoder_layers_25_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1106] + gv3275: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2418: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3275, R.dtype("float16")) + cls.layer_norm2(alloc2417, model_decoder_layers_25_encoder_attn_layer_norm_weight4, model_decoder_layers_25_encoder_attn_layer_norm_bias4, alloc2418) + 
R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_layer_norm_bias4) + model_decoder_layers_25_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1101] + model_decoder_layers_25_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1102] + gv3276: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2419: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3276, R.dtype("float16")) + _2418: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_q_proj_weight4, alloc2418, model_decoder_layers_25_encoder_attn_q_proj_bias4, alloc2419) + R.vm.kill_object(alloc2418) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_q_proj_bias4) + gv3277: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1289: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2419, gv3277, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2419) + gv3278: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1290: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1289, gv3278, sinfo_args=(R.Tensor((seq_len, 20, 64), 
dtype="float16"),)) + R.vm.kill_object(reshape1289) + gv3279: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2420: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3279, R.dtype("float16")) + _2419: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(25), R.prim_value(T.float32(1)), reshape1290, alloc2420) + R.vm.kill_object(reshape1290) + gv3280: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1291: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2420, gv3280, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2420) + gv3281: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1292: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1291, gv3281, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1291) + model_decoder_layers_25_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1103] + model_decoder_layers_25_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1104] + gv3282: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc2421: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3282, R.dtype("float16")) + _2420: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_25_encoder_attn_out_proj_weight4, reshape1292, model_decoder_layers_25_encoder_attn_out_proj_bias4, alloc2421) + R.vm.kill_object(reshape1292) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_25_encoder_attn_out_proj_bias4) + gv3283: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2422: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3283, R.dtype("float16")) + cls.add5(alloc2417, alloc2421, alloc2422) + R.vm.kill_object(alloc2417) + R.vm.kill_object(alloc2421) + model_decoder_layers_25_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1111] + model_decoder_layers_25_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1112] + gv3284: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2423: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3284, R.dtype("float16")) + cls.layer_norm2(alloc2422, model_decoder_layers_25_final_layer_norm_weight4, model_decoder_layers_25_final_layer_norm_bias4, alloc2423) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_25_final_layer_norm_bias4) + model_decoder_layers_25_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1107] + model_decoder_layers_25_fc1_bias4: R.Tensor((5120,), 
dtype="float16") = packed_params[1108] + gv3285: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2424: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3285, R.dtype("float16")) + _2423: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_25_fc1_weight4, alloc2423, model_decoder_layers_25_fc1_bias4, alloc2424) + R.vm.kill_object(alloc2423) + R.vm.kill_object(model_decoder_layers_25_fc1_weight4) + R.vm.kill_object(model_decoder_layers_25_fc1_bias4) + model_decoder_layers_25_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1109] + model_decoder_layers_25_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1110] + gv3286: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2425: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3286, R.dtype("float16")) + _2424: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_25_fc2_weight4, alloc2424, model_decoder_layers_25_fc2_bias4, alloc2425) + R.vm.kill_object(alloc2424) + R.vm.kill_object(model_decoder_layers_25_fc2_weight4) + R.vm.kill_object(model_decoder_layers_25_fc2_bias4) + gv3287: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2426: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3287, R.dtype("float16")) + cls.add5(alloc2422, alloc2425, 
alloc2426) + R.vm.kill_object(alloc2422) + R.vm.kill_object(alloc2425) + model_decoder_layers_26_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1120] + model_decoder_layers_26_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1121] + gv3288: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2427: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3288, R.dtype("float16")) + cls.layer_norm2(alloc2426, model_decoder_layers_26_self_attn_layer_norm_weight4, model_decoder_layers_26_self_attn_layer_norm_bias4, alloc2427) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_26_self_attn_layer_norm_bias4) + model_decoder_layers_26_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1116] + model_decoder_layers_26_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1117] + gv3289: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2428: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3289, R.dtype("float16")) + _2427: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_q_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_q_proj_bias4, alloc2428) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_26_self_attn_q_proj_bias4) + gv3290: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), 
R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1293: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2428, gv3290, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2428) + model_decoder_layers_26_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1113] + gv3291: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2429: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3291, R.dtype("float16")) + _2428: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_26_self_attn_k_proj_weight4, alloc2427, alloc2429) + R.vm.kill_object(model_decoder_layers_26_self_attn_k_proj_weight4) + gv3292: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1294: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2429, gv3292, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2429) + model_decoder_layers_26_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1114] + model_decoder_layers_26_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1115] + gv3293: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2430: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3293, R.dtype("float16")) + _2429: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_v_proj_weight4, alloc2427, model_decoder_layers_26_self_attn_v_proj_bias4, alloc2430) + R.vm.kill_object(alloc2427) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_26_self_attn_v_proj_bias4) + gv3294: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1295: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2430, gv3294, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2430) + gv3295: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2431: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3295, R.dtype("float16")) + cls.concatenate1(reshape1293, reshape1294, reshape1295, alloc2431) + R.vm.kill_object(reshape1293) + R.vm.kill_object(reshape1294) + R.vm.kill_object(reshape1295) + gv3296: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1296: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2431, gv3296, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2431) + gv3297: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2432: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3297, R.dtype("float16")) + _2431: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1296, alloc2432) + R.vm.kill_object(reshape1296) + gv3298: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1297: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2432, gv3298, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2432) + gv3299: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1298: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1297, gv3299, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1297) + model_decoder_layers_26_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1118] + model_decoder_layers_26_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1119] + gv3300: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2433: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv3300, R.dtype("float16")) + _2432: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_self_attn_out_proj_weight4, reshape1298, model_decoder_layers_26_self_attn_out_proj_bias4, alloc2433) + R.vm.kill_object(reshape1298) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_26_self_attn_out_proj_bias4) + gv3301: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2434: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3301, R.dtype("float16")) + cls.add5(alloc2426, alloc2433, alloc2434) + R.vm.kill_object(alloc2426) + R.vm.kill_object(alloc2433) + model_decoder_layers_26_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1129] + model_decoder_layers_26_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1130] + gv3302: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2435: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3302, R.dtype("float16")) + cls.layer_norm2(alloc2434, model_decoder_layers_26_encoder_attn_layer_norm_weight4, model_decoder_layers_26_encoder_attn_layer_norm_bias4, alloc2435) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_layer_norm_bias4) + model_decoder_layers_26_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1125] + model_decoder_layers_26_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[1126] + gv3303: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2436: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3303, R.dtype("float16")) + _2435: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_q_proj_weight4, alloc2435, model_decoder_layers_26_encoder_attn_q_proj_bias4, alloc2436) + R.vm.kill_object(alloc2435) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_q_proj_bias4) + gv3304: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1299: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2436, gv3304, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2436) + gv3305: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1300: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1299, gv3305, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1299) + gv3306: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2437: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, 
R.prim_value(0), gv3306, R.dtype("float16")) + _2436: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(26), R.prim_value(T.float32(1)), reshape1300, alloc2437) + R.vm.kill_object(reshape1300) + gv3307: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1301: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2437, gv3307, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2437) + gv3308: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1302: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1301, gv3308, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1301) + model_decoder_layers_26_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1127] + model_decoder_layers_26_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1128] + gv3309: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2438: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3309, R.dtype("float16")) + _2437: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_26_encoder_attn_out_proj_weight4, reshape1302, model_decoder_layers_26_encoder_attn_out_proj_bias4, alloc2438) + 
R.vm.kill_object(reshape1302) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_26_encoder_attn_out_proj_bias4) + gv3310: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2439: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3310, R.dtype("float16")) + cls.add5(alloc2434, alloc2438, alloc2439) + R.vm.kill_object(alloc2434) + R.vm.kill_object(alloc2438) + model_decoder_layers_26_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1135] + model_decoder_layers_26_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1136] + gv3311: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2440: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3311, R.dtype("float16")) + cls.layer_norm2(alloc2439, model_decoder_layers_26_final_layer_norm_weight4, model_decoder_layers_26_final_layer_norm_bias4, alloc2440) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_26_final_layer_norm_bias4) + model_decoder_layers_26_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1131] + model_decoder_layers_26_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1132] + gv3312: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2441: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, 
R.prim_value(0), gv3312, R.dtype("float16")) + _2440: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_26_fc1_weight4, alloc2440, model_decoder_layers_26_fc1_bias4, alloc2441) + R.vm.kill_object(alloc2440) + R.vm.kill_object(model_decoder_layers_26_fc1_weight4) + R.vm.kill_object(model_decoder_layers_26_fc1_bias4) + model_decoder_layers_26_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1133] + model_decoder_layers_26_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1134] + gv3313: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2442: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3313, R.dtype("float16")) + _2441: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_26_fc2_weight4, alloc2441, model_decoder_layers_26_fc2_bias4, alloc2442) + R.vm.kill_object(alloc2441) + R.vm.kill_object(model_decoder_layers_26_fc2_weight4) + R.vm.kill_object(model_decoder_layers_26_fc2_bias4) + gv3314: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2443: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3314, R.dtype("float16")) + cls.add5(alloc2439, alloc2442, alloc2443) + R.vm.kill_object(alloc2439) + R.vm.kill_object(alloc2442) + model_decoder_layers_27_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1144] + model_decoder_layers_27_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1145] + gv3315: R.Shape(ndim=3) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2444: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3315, R.dtype("float16")) + cls.layer_norm2(alloc2443, model_decoder_layers_27_self_attn_layer_norm_weight4, model_decoder_layers_27_self_attn_layer_norm_bias4, alloc2444) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_27_self_attn_layer_norm_bias4) + model_decoder_layers_27_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1140] + model_decoder_layers_27_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1141] + gv3316: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2445: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3316, R.dtype("float16")) + _2444: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_q_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_q_proj_bias4, alloc2445) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_27_self_attn_q_proj_bias4) + gv3317: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1303: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2445, gv3317, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2445) + model_decoder_layers_27_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1137] + gv3318: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2446: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3318, R.dtype("float16")) + _2445: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_27_self_attn_k_proj_weight4, alloc2444, alloc2446) + R.vm.kill_object(model_decoder_layers_27_self_attn_k_proj_weight4) + gv3319: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1304: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2446, gv3319, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2446) + model_decoder_layers_27_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1138] + model_decoder_layers_27_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1139] + gv3320: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2447: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3320, R.dtype("float16")) + _2446: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_v_proj_weight4, alloc2444, model_decoder_layers_27_self_attn_v_proj_bias4, alloc2447) + 
R.vm.kill_object(alloc2444) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_27_self_attn_v_proj_bias4) + gv3321: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1305: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2447, gv3321, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2447) + gv3322: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2448: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3322, R.dtype("float16")) + cls.concatenate1(reshape1303, reshape1304, reshape1305, alloc2448) + R.vm.kill_object(reshape1303) + R.vm.kill_object(reshape1304) + R.vm.kill_object(reshape1305) + gv3323: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1306: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2448, gv3323, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2448) + gv3324: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2449: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3324, R.dtype("float16")) + _2448: 
R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1306, alloc2449) + R.vm.kill_object(reshape1306) + gv3325: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1307: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2449, gv3325, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2449) + gv3326: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1308: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1307, gv3326, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1307) + model_decoder_layers_27_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1142] + model_decoder_layers_27_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1143] + gv3327: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2450: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3327, R.dtype("float16")) + _2449: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_self_attn_out_proj_weight4, reshape1308, model_decoder_layers_27_self_attn_out_proj_bias4, alloc2450) + R.vm.kill_object(reshape1308) + 
R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_27_self_attn_out_proj_bias4) + gv3328: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2451: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3328, R.dtype("float16")) + cls.add5(alloc2443, alloc2450, alloc2451) + R.vm.kill_object(alloc2443) + R.vm.kill_object(alloc2450) + model_decoder_layers_27_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1153] + model_decoder_layers_27_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1154] + gv3329: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2452: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3329, R.dtype("float16")) + cls.layer_norm2(alloc2451, model_decoder_layers_27_encoder_attn_layer_norm_weight4, model_decoder_layers_27_encoder_attn_layer_norm_bias4, alloc2452) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_layer_norm_bias4) + model_decoder_layers_27_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1149] + model_decoder_layers_27_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1150] + gv3330: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2453: R.Tensor(dtype="float16", ndim=3) = 
R.vm.alloc_tensor(storage39, R.prim_value(0), gv3330, R.dtype("float16")) + _2452: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_q_proj_weight4, alloc2452, model_decoder_layers_27_encoder_attn_q_proj_bias4, alloc2453) + R.vm.kill_object(alloc2452) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_q_proj_bias4) + gv3331: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1309: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2453, gv3331, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2453) + gv3332: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1310: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1309, gv3332, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1309) + gv3333: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2454: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3333, R.dtype("float16")) + _2453: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(27), R.prim_value(T.float32(1)), reshape1310, alloc2454) + R.vm.kill_object(reshape1310) + gv3334: R.Shape(ndim=4) = 
R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1311: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2454, gv3334, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2454) + gv3335: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1312: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1311, gv3335, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1311) + model_decoder_layers_27_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1151] + model_decoder_layers_27_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1152] + gv3336: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2455: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3336, R.dtype("float16")) + _2454: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_27_encoder_attn_out_proj_weight4, reshape1312, model_decoder_layers_27_encoder_attn_out_proj_bias4, alloc2455) + R.vm.kill_object(reshape1312) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_27_encoder_attn_out_proj_bias4) + gv3337: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2456: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3337, R.dtype("float16")) + cls.add5(alloc2451, alloc2455, alloc2456) + R.vm.kill_object(alloc2451) + R.vm.kill_object(alloc2455) + model_decoder_layers_27_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1159] + model_decoder_layers_27_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1160] + gv3338: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2457: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3338, R.dtype("float16")) + cls.layer_norm2(alloc2456, model_decoder_layers_27_final_layer_norm_weight4, model_decoder_layers_27_final_layer_norm_bias4, alloc2457) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_27_final_layer_norm_bias4) + model_decoder_layers_27_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1155] + model_decoder_layers_27_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1156] + gv3339: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2458: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3339, R.dtype("float16")) + _2457: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_27_fc1_weight4, alloc2457, model_decoder_layers_27_fc1_bias4, alloc2458) + R.vm.kill_object(alloc2457) + 
R.vm.kill_object(model_decoder_layers_27_fc1_weight4) + R.vm.kill_object(model_decoder_layers_27_fc1_bias4) + model_decoder_layers_27_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1157] + model_decoder_layers_27_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1158] + gv3340: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2459: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3340, R.dtype("float16")) + _2458: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_27_fc2_weight4, alloc2458, model_decoder_layers_27_fc2_bias4, alloc2459) + R.vm.kill_object(alloc2458) + R.vm.kill_object(model_decoder_layers_27_fc2_weight4) + R.vm.kill_object(model_decoder_layers_27_fc2_bias4) + gv3341: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2460: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3341, R.dtype("float16")) + cls.add5(alloc2456, alloc2459, alloc2460) + R.vm.kill_object(alloc2456) + R.vm.kill_object(alloc2459) + model_decoder_layers_28_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1168] + model_decoder_layers_28_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1169] + gv3342: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2461: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), 
gv3342, R.dtype("float16")) + cls.layer_norm2(alloc2460, model_decoder_layers_28_self_attn_layer_norm_weight4, model_decoder_layers_28_self_attn_layer_norm_bias4, alloc2461) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_28_self_attn_layer_norm_bias4) + model_decoder_layers_28_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1164] + model_decoder_layers_28_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1165] + gv3343: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2462: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3343, R.dtype("float16")) + _2461: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_q_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_q_proj_bias4, alloc2462) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_28_self_attn_q_proj_bias4) + gv3344: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1313: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2462, gv3344, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2462) + model_decoder_layers_28_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1161] + gv3345: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2463: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3345, R.dtype("float16")) + _2462: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_28_self_attn_k_proj_weight4, alloc2461, alloc2463) + R.vm.kill_object(model_decoder_layers_28_self_attn_k_proj_weight4) + gv3346: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1314: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2463, gv3346, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2463) + model_decoder_layers_28_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1162] + model_decoder_layers_28_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1163] + gv3347: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2464: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3347, R.dtype("float16")) + _2463: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_v_proj_weight4, alloc2461, model_decoder_layers_28_self_attn_v_proj_bias4, alloc2464) + R.vm.kill_object(alloc2461) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_28_self_attn_v_proj_bias4) + gv3348: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), 
R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1315: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2464, gv3348, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2464) + gv3349: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2465: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3349, R.dtype("float16")) + cls.concatenate1(reshape1313, reshape1314, reshape1315, alloc2465) + R.vm.kill_object(reshape1313) + R.vm.kill_object(reshape1314) + R.vm.kill_object(reshape1315) + gv3350: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1316: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2465, gv3350, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2465) + gv3351: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2466: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3351, R.dtype("float16")) + _2465: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1316, alloc2466) + R.vm.kill_object(reshape1316) + gv3352: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1317: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2466, gv3352, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2466) + gv3353: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1318: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1317, gv3353, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1317) + model_decoder_layers_28_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1166] + model_decoder_layers_28_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1167] + gv3354: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2467: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3354, R.dtype("float16")) + _2466: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_self_attn_out_proj_weight4, reshape1318, model_decoder_layers_28_self_attn_out_proj_bias4, alloc2467) + R.vm.kill_object(reshape1318) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_28_self_attn_out_proj_bias4) + gv3355: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2468: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3355, R.dtype("float16")) + cls.add5(alloc2460, alloc2467, alloc2468) + R.vm.kill_object(alloc2460) + R.vm.kill_object(alloc2467) + model_decoder_layers_28_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1177] + model_decoder_layers_28_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1178] + gv3356: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2469: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3356, R.dtype("float16")) + cls.layer_norm2(alloc2468, model_decoder_layers_28_encoder_attn_layer_norm_weight4, model_decoder_layers_28_encoder_attn_layer_norm_bias4, alloc2469) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_layer_norm_bias4) + model_decoder_layers_28_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1173] + model_decoder_layers_28_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1174] + gv3357: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2470: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3357, R.dtype("float16")) + _2469: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_q_proj_weight4, alloc2469, model_decoder_layers_28_encoder_attn_q_proj_bias4, alloc2470) + R.vm.kill_object(alloc2469) + 
R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_q_proj_bias4) + gv3358: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1319: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2470, gv3358, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2470) + gv3359: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1320: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1319, gv3359, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1319) + gv3360: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2471: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3360, R.dtype("float16")) + _2470: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(28), R.prim_value(T.float32(1)), reshape1320, alloc2471) + R.vm.kill_object(reshape1320) + gv3361: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1321: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2471, gv3361, 
sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2471) + gv3362: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1322: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1321, gv3362, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1321) + model_decoder_layers_28_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1175] + model_decoder_layers_28_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1176] + gv3363: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2472: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3363, R.dtype("float16")) + _2471: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_28_encoder_attn_out_proj_weight4, reshape1322, model_decoder_layers_28_encoder_attn_out_proj_bias4, alloc2472) + R.vm.kill_object(reshape1322) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_28_encoder_attn_out_proj_bias4) + gv3364: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2473: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3364, R.dtype("float16")) + cls.add5(alloc2468, alloc2472, alloc2473) + R.vm.kill_object(alloc2468) + R.vm.kill_object(alloc2472) + 
model_decoder_layers_28_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1183] + model_decoder_layers_28_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1184] + gv3365: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2474: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3365, R.dtype("float16")) + cls.layer_norm2(alloc2473, model_decoder_layers_28_final_layer_norm_weight4, model_decoder_layers_28_final_layer_norm_bias4, alloc2474) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_28_final_layer_norm_bias4) + model_decoder_layers_28_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1179] + model_decoder_layers_28_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1180] + gv3366: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2475: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3366, R.dtype("float16")) + _2474: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_28_fc1_weight4, alloc2474, model_decoder_layers_28_fc1_bias4, alloc2475) + R.vm.kill_object(alloc2474) + R.vm.kill_object(model_decoder_layers_28_fc1_weight4) + R.vm.kill_object(model_decoder_layers_28_fc1_bias4) + model_decoder_layers_28_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1181] + model_decoder_layers_28_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1182] + gv3367: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", 
shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2476: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3367, R.dtype("float16")) + _2475: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_28_fc2_weight4, alloc2475, model_decoder_layers_28_fc2_bias4, alloc2476) + R.vm.kill_object(alloc2475) + R.vm.kill_object(model_decoder_layers_28_fc2_weight4) + R.vm.kill_object(model_decoder_layers_28_fc2_bias4) + gv3368: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2477: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3368, R.dtype("float16")) + cls.add5(alloc2473, alloc2476, alloc2477) + R.vm.kill_object(alloc2473) + R.vm.kill_object(alloc2476) + model_decoder_layers_29_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1192] + model_decoder_layers_29_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1193] + gv3369: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2478: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3369, R.dtype("float16")) + cls.layer_norm2(alloc2477, model_decoder_layers_29_self_attn_layer_norm_weight4, model_decoder_layers_29_self_attn_layer_norm_bias4, alloc2478) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_29_self_attn_layer_norm_bias4) + model_decoder_layers_29_self_attn_q_proj_weight4: 
R.Tensor((1280, 1280), dtype="float16") = packed_params[1188] + model_decoder_layers_29_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1189] + gv3370: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2479: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3370, R.dtype("float16")) + _2478: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_q_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_q_proj_bias4, alloc2479) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_29_self_attn_q_proj_bias4) + gv3371: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1323: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2479, gv3371, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2479) + model_decoder_layers_29_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1185] + gv3372: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2480: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3372, R.dtype("float16")) + _2479: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_29_self_attn_k_proj_weight4, alloc2478, alloc2480) + 
R.vm.kill_object(model_decoder_layers_29_self_attn_k_proj_weight4) + gv3373: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1324: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2480, gv3373, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2480) + model_decoder_layers_29_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1186] + model_decoder_layers_29_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1187] + gv3374: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2481: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3374, R.dtype("float16")) + _2480: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_v_proj_weight4, alloc2478, model_decoder_layers_29_self_attn_v_proj_bias4, alloc2481) + R.vm.kill_object(alloc2478) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_29_self_attn_v_proj_bias4) + gv3375: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1325: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2481, gv3375, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2481) + gv3376: 
R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2482: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3376, R.dtype("float16")) + cls.concatenate1(reshape1323, reshape1324, reshape1325, alloc2482) + R.vm.kill_object(reshape1323) + R.vm.kill_object(reshape1324) + R.vm.kill_object(reshape1325) + gv3377: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1326: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2482, gv3377, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2482) + gv3378: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2483: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3378, R.dtype("float16")) + _2482: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1326, alloc2483) + R.vm.kill_object(reshape1326) + gv3379: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1327: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2483, gv3379, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + 
R.vm.kill_object(alloc2483) + gv3380: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1328: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1327, gv3380, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1327) + model_decoder_layers_29_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1190] + model_decoder_layers_29_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1191] + gv3381: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2484: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3381, R.dtype("float16")) + _2483: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_self_attn_out_proj_weight4, reshape1328, model_decoder_layers_29_self_attn_out_proj_bias4, alloc2484) + R.vm.kill_object(reshape1328) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_29_self_attn_out_proj_bias4) + gv3382: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2485: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3382, R.dtype("float16")) + cls.add5(alloc2477, alloc2484, alloc2485) + R.vm.kill_object(alloc2477) + R.vm.kill_object(alloc2484) + model_decoder_layers_29_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = 
packed_params[1201] + model_decoder_layers_29_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1202] + gv3383: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2486: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3383, R.dtype("float16")) + cls.layer_norm2(alloc2485, model_decoder_layers_29_encoder_attn_layer_norm_weight4, model_decoder_layers_29_encoder_attn_layer_norm_bias4, alloc2486) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_layer_norm_bias4) + model_decoder_layers_29_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1197] + model_decoder_layers_29_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1198] + gv3384: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2487: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3384, R.dtype("float16")) + _2486: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_q_proj_weight4, alloc2486, model_decoder_layers_29_encoder_attn_q_proj_bias4, alloc2487) + R.vm.kill_object(alloc2486) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_q_proj_bias4) + gv3385: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), 
sinfo_args=(R.Shape(ndim=4),)) + reshape1329: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2487, gv3385, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2487) + gv3386: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1330: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1329, gv3386, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1329) + gv3387: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2488: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3387, R.dtype("float16")) + _2487: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(29), R.prim_value(T.float32(1)), reshape1330, alloc2488) + R.vm.kill_object(reshape1330) + gv3388: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1331: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2488, gv3388, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2488) + gv3389: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1332: R.Tensor((1, seq_len, 1280), 
dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1331, gv3389, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1331) + model_decoder_layers_29_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1199] + model_decoder_layers_29_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1200] + gv3390: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2489: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3390, R.dtype("float16")) + _2488: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_29_encoder_attn_out_proj_weight4, reshape1332, model_decoder_layers_29_encoder_attn_out_proj_bias4, alloc2489) + R.vm.kill_object(reshape1332) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_29_encoder_attn_out_proj_bias4) + gv3391: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2490: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3391, R.dtype("float16")) + cls.add5(alloc2485, alloc2489, alloc2490) + R.vm.kill_object(alloc2485) + R.vm.kill_object(alloc2489) + model_decoder_layers_29_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1207] + model_decoder_layers_29_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1208] + gv3392: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2491: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3392, R.dtype("float16")) + cls.layer_norm2(alloc2490, model_decoder_layers_29_final_layer_norm_weight4, model_decoder_layers_29_final_layer_norm_bias4, alloc2491) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_29_final_layer_norm_bias4) + model_decoder_layers_29_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1203] + model_decoder_layers_29_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1204] + gv3393: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2492: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3393, R.dtype("float16")) + _2491: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_29_fc1_weight4, alloc2491, model_decoder_layers_29_fc1_bias4, alloc2492) + R.vm.kill_object(alloc2491) + R.vm.kill_object(model_decoder_layers_29_fc1_weight4) + R.vm.kill_object(model_decoder_layers_29_fc1_bias4) + model_decoder_layers_29_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1205] + model_decoder_layers_29_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1206] + gv3394: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2493: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3394, R.dtype("float16")) + _2492: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", 
model_decoder_layers_29_fc2_weight4, alloc2492, model_decoder_layers_29_fc2_bias4, alloc2493) + R.vm.kill_object(alloc2492) + R.vm.kill_object(model_decoder_layers_29_fc2_weight4) + R.vm.kill_object(model_decoder_layers_29_fc2_bias4) + gv3395: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2494: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3395, R.dtype("float16")) + cls.add5(alloc2490, alloc2493, alloc2494) + R.vm.kill_object(alloc2490) + R.vm.kill_object(alloc2493) + model_decoder_layers_30_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1216] + model_decoder_layers_30_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1217] + gv3396: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2495: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3396, R.dtype("float16")) + cls.layer_norm2(alloc2494, model_decoder_layers_30_self_attn_layer_norm_weight4, model_decoder_layers_30_self_attn_layer_norm_bias4, alloc2495) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_30_self_attn_layer_norm_bias4) + model_decoder_layers_30_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1212] + model_decoder_layers_30_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1213] + gv3397: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), 
sinfo_args=(R.Shape(ndim=3),)) + alloc2496: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3397, R.dtype("float16")) + _2495: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_q_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_q_proj_bias4, alloc2496) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_30_self_attn_q_proj_bias4) + gv3398: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1333: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2496, gv3398, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2496) + model_decoder_layers_30_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1209] + gv3399: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2497: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3399, R.dtype("float16")) + _2496: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_30_self_attn_k_proj_weight4, alloc2495, alloc2497) + R.vm.kill_object(model_decoder_layers_30_self_attn_k_proj_weight4) + gv3400: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1334: R.Tensor((1, seq_len, 20, 64), 
dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2497, gv3400, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2497) + model_decoder_layers_30_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1210] + model_decoder_layers_30_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1211] + gv3401: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2498: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3401, R.dtype("float16")) + _2497: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_v_proj_weight4, alloc2495, model_decoder_layers_30_self_attn_v_proj_bias4, alloc2498) + R.vm.kill_object(alloc2495) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_30_self_attn_v_proj_bias4) + gv3402: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1335: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2498, gv3402, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2498) + gv3403: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2499: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3403, R.dtype("float16")) + 
cls.concatenate1(reshape1333, reshape1334, reshape1335, alloc2499) + R.vm.kill_object(reshape1333) + R.vm.kill_object(reshape1334) + R.vm.kill_object(reshape1335) + gv3404: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1336: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2499, gv3404, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2499) + gv3405: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2500: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3405, R.dtype("float16")) + _2499: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1336, alloc2500) + R.vm.kill_object(reshape1336) + gv3406: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1337: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2500, gv3406, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2500) + gv3407: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1338: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1337, gv3407, 
sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1337) + model_decoder_layers_30_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1214] + model_decoder_layers_30_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1215] + gv3408: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2501: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3408, R.dtype("float16")) + _2500: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_self_attn_out_proj_weight4, reshape1338, model_decoder_layers_30_self_attn_out_proj_bias4, alloc2501) + R.vm.kill_object(reshape1338) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_30_self_attn_out_proj_bias4) + gv3409: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2502: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3409, R.dtype("float16")) + cls.add5(alloc2494, alloc2501, alloc2502) + R.vm.kill_object(alloc2494) + R.vm.kill_object(alloc2501) + model_decoder_layers_30_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1225] + model_decoder_layers_30_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1226] + gv3410: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2503: 
R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3410, R.dtype("float16")) + cls.layer_norm2(alloc2502, model_decoder_layers_30_encoder_attn_layer_norm_weight4, model_decoder_layers_30_encoder_attn_layer_norm_bias4, alloc2503) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_layer_norm_bias4) + model_decoder_layers_30_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1221] + model_decoder_layers_30_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1222] + gv3411: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2504: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3411, R.dtype("float16")) + _2503: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_q_proj_weight4, alloc2503, model_decoder_layers_30_encoder_attn_q_proj_bias4, alloc2504) + R.vm.kill_object(alloc2503) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_q_proj_bias4) + gv3412: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1339: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2504, gv3412, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2504) + gv3413: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1340: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1339, gv3413, sinfo_args=(R.Tensor((seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(reshape1339) + gv3414: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2505: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3414, R.dtype("float16")) + _2504: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(30), R.prim_value(T.float32(1)), reshape1340, alloc2505) + R.vm.kill_object(reshape1340) + gv3415: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1341: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2505, gv3415, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2505) + gv3416: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1342: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1341, gv3416, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1341) + model_decoder_layers_30_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1223] + model_decoder_layers_30_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") 
= packed_params[1224] + gv3417: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2506: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3417, R.dtype("float16")) + _2505: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_30_encoder_attn_out_proj_weight4, reshape1342, model_decoder_layers_30_encoder_attn_out_proj_bias4, alloc2506) + R.vm.kill_object(reshape1342) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_30_encoder_attn_out_proj_bias4) + gv3418: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2507: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3418, R.dtype("float16")) + cls.add5(alloc2502, alloc2506, alloc2507) + R.vm.kill_object(alloc2502) + R.vm.kill_object(alloc2506) + model_decoder_layers_30_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1231] + model_decoder_layers_30_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1232] + gv3419: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2508: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3419, R.dtype("float16")) + cls.layer_norm2(alloc2507, model_decoder_layers_30_final_layer_norm_weight4, model_decoder_layers_30_final_layer_norm_bias4, alloc2508) + 
R.vm.kill_object(model_decoder_layers_30_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_30_final_layer_norm_bias4) + model_decoder_layers_30_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1227] + model_decoder_layers_30_fc1_bias4: R.Tensor((5120,), dtype="float16") = packed_params[1228] + gv3420: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2509: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3420, R.dtype("float16")) + _2508: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_30_fc1_weight4, alloc2508, model_decoder_layers_30_fc1_bias4, alloc2509) + R.vm.kill_object(alloc2508) + R.vm.kill_object(model_decoder_layers_30_fc1_weight4) + R.vm.kill_object(model_decoder_layers_30_fc1_bias4) + model_decoder_layers_30_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1229] + model_decoder_layers_30_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1230] + gv3421: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2510: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3421, R.dtype("float16")) + _2509: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_30_fc2_weight4, alloc2509, model_decoder_layers_30_fc2_bias4, alloc2510) + R.vm.kill_object(alloc2509) + R.vm.kill_object(model_decoder_layers_30_fc2_weight4) + R.vm.kill_object(model_decoder_layers_30_fc2_bias4) + gv3422: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), 
R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2511: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3422, R.dtype("float16")) + cls.add5(alloc2507, alloc2510, alloc2511) + R.vm.kill_object(alloc2507) + R.vm.kill_object(alloc2510) + model_decoder_layers_31_self_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1240] + model_decoder_layers_31_self_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1241] + gv3423: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2512: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3423, R.dtype("float16")) + cls.layer_norm2(alloc2511, model_decoder_layers_31_self_attn_layer_norm_weight4, model_decoder_layers_31_self_attn_layer_norm_bias4, alloc2512) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_31_self_attn_layer_norm_bias4) + model_decoder_layers_31_self_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1236] + model_decoder_layers_31_self_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1237] + gv3424: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2513: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3424, R.dtype("float16")) + _2512: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_q_proj_weight4, alloc2512, 
model_decoder_layers_31_self_attn_q_proj_bias4, alloc2513) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_31_self_attn_q_proj_bias4) + gv3425: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1343: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2513, gv3425, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2513) + model_decoder_layers_31_self_attn_k_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1233] + gv3426: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2514: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3426, R.dtype("float16")) + _2513: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul1_cublas", model_decoder_layers_31_self_attn_k_proj_weight4, alloc2512, alloc2514) + R.vm.kill_object(model_decoder_layers_31_self_attn_k_proj_weight4) + gv3427: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1344: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2514, gv3427, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2514) + model_decoder_layers_31_self_attn_v_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1234] + 
model_decoder_layers_31_self_attn_v_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1235] + gv3428: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2515: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3428, R.dtype("float16")) + _2514: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_v_proj_weight4, alloc2512, model_decoder_layers_31_self_attn_v_proj_bias4, alloc2515) + R.vm.kill_object(alloc2512) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_weight4) + R.vm.kill_object(model_decoder_layers_31_self_attn_v_proj_bias4) + gv3429: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1345: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2515, gv3429, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2515) + gv3430: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + alloc2516: R.Tensor(dtype="float16", ndim=4) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3430, R.dtype("float16")) + cls.concatenate1(reshape1343, reshape1344, reshape1345, alloc2516) + R.vm.kill_object(reshape1343) + R.vm.kill_object(reshape1344) + R.vm.kill_object(reshape1345) + gv3431: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), 
R.prim_value(0), R.prim_value(60), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1346: R.Tensor((seq_len, 60, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2516, gv3431, sinfo_args=(R.Tensor((seq_len, 60, 64), dtype="float16"),)) + R.vm.kill_object(alloc2516) + gv3432: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2517: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3432, R.dtype("float16")) + _2516: R.Object = R.call_packed("vm.builtin.attention_kv_cache_attention_with_fused_qkv", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1346, alloc2517) + R.vm.kill_object(reshape1346) + gv3433: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1347: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2517, gv3433, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2517) + gv3434: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1348: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1347, gv3434, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1347) + model_decoder_layers_31_self_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1238] + model_decoder_layers_31_self_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = 
packed_params[1239] + gv3435: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2518: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3435, R.dtype("float16")) + _2517: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_self_attn_out_proj_weight4, reshape1348, model_decoder_layers_31_self_attn_out_proj_bias4, alloc2518) + R.vm.kill_object(reshape1348) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_31_self_attn_out_proj_bias4) + gv3436: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2519: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3436, R.dtype("float16")) + cls.add5(alloc2511, alloc2518, alloc2519) + R.vm.kill_object(alloc2511) + R.vm.kill_object(alloc2518) + model_decoder_layers_31_encoder_attn_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1249] + model_decoder_layers_31_encoder_attn_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1250] + gv3437: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2520: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3437, R.dtype("float16")) + cls.layer_norm2(alloc2519, model_decoder_layers_31_encoder_attn_layer_norm_weight4, model_decoder_layers_31_encoder_attn_layer_norm_bias4, alloc2520) + 
R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_layer_norm_bias4) + model_decoder_layers_31_encoder_attn_q_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1245] + model_decoder_layers_31_encoder_attn_q_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1246] + gv3438: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2521: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3438, R.dtype("float16")) + _2520: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_q_proj_weight4, alloc2520, model_decoder_layers_31_encoder_attn_q_proj_bias4, alloc2521) + R.vm.kill_object(alloc2520) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_weight4) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_q_proj_bias4) + gv3439: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1349: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2521, gv3439, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2521) + gv3440: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + reshape1350: R.Tensor((seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1349, gv3440, sinfo_args=(R.Tensor((seq_len, 20, 64), 
dtype="float16"),)) + R.vm.kill_object(reshape1349) + gv3441: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=3),)) + alloc2522: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3441, R.dtype("float16")) + _2521: R.Object = R.call_packed("vm.builtin.attention_kv_cache_cross_attention", paged_kv_cache, R.prim_value(31), R.prim_value(T.float32(1)), reshape1350, alloc2522) + R.vm.kill_object(reshape1350) + gv3442: R.Shape(ndim=4) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(4), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(20), R.prim_value(0), R.prim_value(64), sinfo_args=(R.Shape(ndim=4),)) + reshape1351: R.Tensor((1, seq_len, 20, 64), dtype="float16") = R.call_packed("vm.builtin.reshape", alloc2522, gv3442, sinfo_args=(R.Tensor((1, seq_len, 20, 64), dtype="float16"),)) + R.vm.kill_object(alloc2522) + gv3443: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + reshape1352: R.Tensor((1, seq_len, 1280), dtype="float16") = R.call_packed("vm.builtin.reshape", reshape1351, gv3443, sinfo_args=(R.Tensor((1, seq_len, 1280), dtype="float16"),)) + R.vm.kill_object(reshape1351) + model_decoder_layers_31_encoder_attn_out_proj_weight4: R.Tensor((1280, 1280), dtype="float16") = packed_params[1247] + model_decoder_layers_31_encoder_attn_out_proj_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1248] + gv3444: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + 
alloc2523: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, R.prim_value(0), gv3444, R.dtype("float16")) + _2522: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add1_cublas", model_decoder_layers_31_encoder_attn_out_proj_weight4, reshape1352, model_decoder_layers_31_encoder_attn_out_proj_bias4, alloc2523) + R.vm.kill_object(reshape1352) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_weight4) + R.vm.kill_object(model_decoder_layers_31_encoder_attn_out_proj_bias4) + gv3445: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2524: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage39, R.prim_value(0), gv3445, R.dtype("float16")) + R.vm.kill_object(storage39) + cls.add5(alloc2519, alloc2523, alloc2524) + R.vm.kill_object(alloc2519) + R.vm.kill_object(alloc2523) + model_decoder_layers_31_final_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1255] + model_decoder_layers_31_final_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1256] + gv3446: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2525: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3446, R.dtype("float16")) + cls.layer_norm2(alloc2524, model_decoder_layers_31_final_layer_norm_weight4, model_decoder_layers_31_final_layer_norm_bias4, alloc2525) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_weight4) + R.vm.kill_object(model_decoder_layers_31_final_layer_norm_bias4) + model_decoder_layers_31_fc1_weight4: R.Tensor((5120, 1280), dtype="float16") = packed_params[1251] + model_decoder_layers_31_fc1_bias4: 
R.Tensor((5120,), dtype="float16") = packed_params[1252] + gv3447: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(5120), sinfo_args=(R.Shape(ndim=3),)) + alloc2526: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage37, R.prim_value(0), gv3447, R.dtype("float16")) + R.vm.kill_object(storage37) + _2525: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add_relax_nn_gelu_cublas", model_decoder_layers_31_fc1_weight4, alloc2525, model_decoder_layers_31_fc1_bias4, alloc2526) + R.vm.kill_object(alloc2525) + R.vm.kill_object(model_decoder_layers_31_fc1_weight4) + R.vm.kill_object(model_decoder_layers_31_fc1_bias4) + model_decoder_layers_31_fc2_weight4: R.Tensor((1280, 5120), dtype="float16") = packed_params[1253] + model_decoder_layers_31_fc2_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1254] + gv3448: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2527: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage38, R.prim_value(0), gv3448, R.dtype("float16")) + R.vm.kill_object(storage38) + _2526: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul_relax_add2_cublas", model_decoder_layers_31_fc2_weight4, alloc2526, model_decoder_layers_31_fc2_bias4, alloc2527) + R.vm.kill_object(alloc2526) + R.vm.kill_object(model_decoder_layers_31_fc2_weight4) + R.vm.kill_object(model_decoder_layers_31_fc2_bias4) + gv3449: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2528: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage40, 
R.prim_value(0), gv3449, R.dtype("float16")) + R.vm.kill_object(storage40) + cls.add5(alloc2524, alloc2527, alloc2528) + R.vm.kill_object(alloc2524) + R.vm.kill_object(alloc2527) + model_decoder_layer_norm_weight4: R.Tensor((1280,), dtype="float16") = packed_params[1257] + model_decoder_layer_norm_bias4: R.Tensor((1280,), dtype="float16") = packed_params[1258] + gv3450: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1280), sinfo_args=(R.Shape(ndim=3),)) + alloc2529: R.Tensor(dtype="float16", ndim=3) = R.vm.alloc_tensor(storage41, R.prim_value(0), gv3450, R.dtype("float16")) + R.vm.kill_object(storage41) + cls.layer_norm2(alloc2528, model_decoder_layer_norm_weight4, model_decoder_layer_norm_bias4, alloc2529) + R.vm.kill_object(alloc2528) + R.vm.kill_object(model_decoder_layer_norm_weight4) + R.vm.kill_object(model_decoder_layer_norm_bias4) + storage42: R.Object = R.vm.alloc_storage(R.shape([2560]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc2530: R.Tensor((1, 1, 1280), dtype="float16") = R.vm.alloc_tensor(storage42, R.prim_value(0), R.shape([1, 1, 1280]), R.dtype("float16")) + R.vm.kill_object(storage42) + cls.index(alloc2529, alloc2530) + R.vm.kill_object(alloc2529) + storage: R.Object = R.vm.alloc_storage(R.shape([207464]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + alloc2531: R.Tensor((1, 1, 51866), dtype="float32") = R.vm.alloc_tensor(storage, R.prim_value(0), R.shape([1, 1, 51866]), R.dtype("float32")) + R.vm.kill_object(storage) + _2530: R.Object = R.call_packed("fused_relax_permute_dims_relax_matmul2_cublas", model_decoder_embed_tokens_weight4, alloc2530, alloc2531) + R.vm.kill_object(model_decoder_embed_tokens_weight4) + R.vm.kill_object(alloc2530) + return alloc2531 + + @R.function + def renormalize_by_top_p(probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), top_p: 
R.Tensor(("batch_size",), dtype="float32"), init_pivots: R.Tensor(("batch_size", 3), dtype="float32")) -> R.Tensor(("batch_size", "vocab_size"), dtype="float32"): + batch_size = T.int64() + vocab_size = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", init_pivots, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[0], param=probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[1], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", init_pivots, shape_heap, R.prim_value(2), R.prim_value(3), 
R.prim_value(0), R.prim_value(0), R.prim_value(3), R.str("ErrorContext(fn=renormalize_by_top_p, loc=param[2], param=init_pivots, annotation=R.Tensor((batch_size, 3), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + cls.shape_func4(shape_heap) + storage43: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3451: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) + alloc2532: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage43, R.prim_value(0), gv3451, R.dtype("float32")) + R.vm.kill_object(storage43) + storage44: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3452: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(0), sinfo_args=(R.Shape(ndim=1),)) + alloc2533: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage44, R.prim_value(0), gv3452, R.dtype("float32")) + R.vm.kill_object(storage44) + cls.top_p_pivot_cutoff(probs, top_p, init_pivots, alloc2532, alloc2533) + lv6: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1)) = alloc2532, alloc2533 + gv3453: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + storage45: R.Object = R.vm.alloc_storage(gv3453, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3454: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc2534: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage45, R.prim_value(0), gv3454, R.dtype("float32")) + R.vm.kill_object(storage45) + cls.top_p_renorm_after_cutoff(probs, alloc2532, alloc2533, alloc2534) + R.vm.kill_object(alloc2532) + 
R.vm.kill_object(alloc2533) + R.call_packed("vm.builtin.match_shape", alloc2534, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=renormalize_by_top_p, loc=return, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + return alloc2534 + + @R.function + def sample_with_top_p(sorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), uniform_samples: R.Tensor(("num_samples",), dtype="float32"), sample_indices: R.Tensor(("num_samples",), dtype="int32"), top_p: R.Tensor(("batch_size",), dtype="float32")) -> R.Tensor(("num_samples",), dtype="int32"): + num_samples = T.int64() + batch_size = T.int64() + vocab_size = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(6),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", sorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + 
R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", top_p, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[0], param=sorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sample_with_top_p, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[2], param=uniform_samples, annotation=R.Tensor((num_samples,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sample_with_top_p, loc=param[3], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", top_p, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sample_with_top_p, loc=param[4], param=top_p, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + 
cls.shape_func3(shape_heap) + gv2568: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + uniform_samples1: R.Tensor((num_samples, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", uniform_samples, gv2568, sinfo_args=(R.Tensor((num_samples, 1), dtype="float32"),)) + gv2569: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + sample_indices1: R.Tensor((num_samples, 1), dtype="int32") = R.call_packed("vm.builtin.reshape", sample_indices, gv2569, sinfo_args=(R.Tensor((num_samples, 1), dtype="int32"),)) + gv2570: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + sample_indices2: R.Tensor((batch_size, 1), dtype="float32") = R.call_packed("vm.builtin.reshape", top_p, gv2570, sinfo_args=(R.Tensor((batch_size, 1), dtype="float32"),)) + storage33: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2571: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1978: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2571, R.dtype("int32")) + gv2572: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=1),)) + R.call_packed("vm.builtin.call_tir_dyn", cls.full, alloc1978, gv2572, sinfo_args=(R.Tuple,)) + gv2573: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) + 
storage34: R.Object = R.vm.alloc_storage(gv2573, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2574: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) + lv1: R.Tensor(dtype="uint8", ndim=1) = R.vm.alloc_tensor(storage34, R.prim_value(0), gv2574, R.dtype("uint8")) + R.vm.kill_object(storage34) + gv2575: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(5), sinfo_args=(R.Shape(ndim=1),)) + storage35: R.Object = R.vm.alloc_storage(gv2575, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2576: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1979: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage35, R.prim_value(0), gv2576, R.dtype("float32")) + R.vm.kill_object(storage35) + cls.cumsum(sorted_probs, lv1, alloc1979) + R.vm.kill_object(lv1) + storage36: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2577: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1980: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage36, R.prim_value(0), gv2577, R.dtype("float32")) + R.vm.kill_object(storage36) + cls.get_renorm_prob(alloc1979, sample_indices2, alloc1978, alloc1980) + R.vm.kill_object(sample_indices2) + R.vm.kill_object(alloc1978) + gv2578: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(2), R.prim_value(0), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc1981: R.Tensor(dtype="int32", ndim=2) = R.vm.alloc_tensor(storage33, R.prim_value(0), gv2578, R.dtype("int32")) + 
R.vm.kill_object(storage33) + cls.get_index_from_sorted(alloc1979, sorted_indices, alloc1980, uniform_samples1, sample_indices1, alloc1981) + R.vm.kill_object(uniform_samples1) + R.vm.kill_object(sample_indices1) + R.vm.kill_object(alloc1979) + R.vm.kill_object(alloc1980) + gv2579: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + gv2: R.Tensor((num_samples,), dtype="int32") = R.call_packed("vm.builtin.reshape", alloc1981, gv2579, sinfo_args=(R.Tensor((num_samples,), dtype="int32"),)) + R.vm.kill_object(alloc1981) + return gv2 + + @R.function + def sampler_take_probs(unsorted_probs: R.Tensor(("batch_size", "vocab_size"), dtype="float32"), sorted_indices: R.Tensor(("batch_size", "vocab_size"), dtype="int32"), sample_indices: R.Tensor(("num_samples",), dtype="int32"), sampling_result: R.Tensor(("num_samples",), dtype="int32"), lobprob_offsets: R.Tensor(("num_positions",), dtype="int32")) -> R.Tuple(R.Tensor(("num_samples",), dtype="float32"), R.Tensor(("num_positions",), dtype="float32"), R.Tensor(("num_positions",), dtype="int32")): + num_samples = T.int64() + num_positions = T.int64() + batch_size = T.int64() + vocab_size = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(4),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", unsorted_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", sorted_indices, R.prim_value(2), R.dtype("int32"), 
R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", sample_indices, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", sampling_result, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", lobprob_offsets, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", unsorted_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[0], param=unsorted_probs, annotation=R.Tensor((batch_size, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sorted_indices, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_take_probs, loc=param[1], param=sorted_indices, annotation=R.Tensor((batch_size, vocab_size), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sample_indices, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[2], param=sample_indices, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", sampling_result, shape_heap, R.prim_value(1), R.prim_value(3), 
R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=param[3], param=sampling_result, annotation=R.Tensor((num_samples,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", lobprob_offsets, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=param[4], param=lobprob_offsets, annotation=R.Tensor((num_positions,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + storage: R.Object = R.vm.alloc_storage(R.shape([32]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + alloc: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage, R.prim_value(0), gv, R.dtype("float32")) + R.vm.kill_object(storage) + storage1: R.Object = R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv1: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) + alloc1: R.Tensor(dtype="float32", ndim=1) = R.vm.alloc_tensor(storage1, R.prim_value(0), gv1, R.dtype("float32")) + R.vm.kill_object(storage1) + storage2: R.Object = R.vm.alloc_storage(R.shape([192]), R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv2: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=1),)) + alloc2: R.Tensor(dtype="int32", ndim=1) = R.vm.alloc_tensor(storage2, R.prim_value(0), gv2, R.dtype("int32")) + R.vm.kill_object(storage2) + cls.sampler_take_probs_tir(unsorted_probs, sorted_indices, sample_indices, sampling_result, lobprob_offsets, alloc, alloc1, alloc2) + gv3: R.Tuple(R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="float32", ndim=1), R.Tensor(dtype="int32", ndim=1)) = alloc, alloc1, alloc2 + R.vm.kill_object(alloc) + 
R.vm.kill_object(alloc1) + R.vm.kill_object(alloc2) + gv3_1: R.Tensor(dtype="float32", ndim=1) = gv3[0] + R.call_packed("vm.builtin.match_shape", gv3_1, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(2), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) + gv4: R.Tensor(dtype="float32", ndim=1) = gv3[1] + R.call_packed("vm.builtin.match_shape", gv4, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) + gv5: R.Tensor(dtype="int32", ndim=1) = gv3[2] + R.call_packed("vm.builtin.match_shape", gv5, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(3), R.str("ErrorContext(fn=sampler_take_probs, loc=return, annotation=R.Tuple(R.Tensor((num_samples,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"float32\"), R.Tensor((num_positions,), dtype=\"int32\"))) "), sinfo_args=(R.Tuple,)) + return gv3 + + @R.function + def sampler_verify_draft_tokens(draft_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), draft_tokens: R.Tensor(("num_nodes",), dtype="int32"), model_probs: R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), token_tree_first_child: R.Tensor(("num_nodes",), dtype="int32"), token_tree_next_sibling: R.Tensor(("num_nodes",), dtype="int32"), uniform_samples: R.Tensor(("num_nodes",), dtype="float32"), token_tree_parent_ptr: R.Tensor(("nbatch",), dtype="int32")) -> R.Tuple(R.Tensor(("num_nodes", "vocab_size"), dtype="float32"), R.Tensor(("nbatch",), dtype="int32")): + num_nodes = T.int64() + vocab_size = T.int64() + nbatch = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": 
["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "num_positions": 48, "num_samples": 8}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(3),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", draft_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", draft_tokens, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", model_probs, R.prim_value(2), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", token_tree_first_child, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", token_tree_next_sibling, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", uniform_samples, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", 
token_tree_parent_ptr, R.prim_value(1), R.dtype("int32"), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", draft_probs, shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[0], param=draft_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", draft_tokens, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[1], param=draft_tokens, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", model_probs, shape_heap, R.prim_value(2), R.prim_value(3), R.prim_value(0), R.prim_value(3), R.prim_value(1), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[2], param=model_probs, annotation=R.Tensor((num_nodes, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", token_tree_first_child, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[3], param=token_tree_first_child, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", token_tree_next_sibling, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[4], param=token_tree_next_sibling, annotation=R.Tensor((num_nodes,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", uniform_samples, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[5], param=uniform_samples, 
annotation=R.Tensor((num_nodes,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", token_tree_parent_ptr, shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), R.str("ErrorContext(fn=sampler_verify_draft_tokens, loc=param[6], param=token_tree_parent_ptr, annotation=R.Tensor((nbatch,), dtype=\"int32\")) "), sinfo_args=(R.Tuple,)) + cls.batch_verify_on_gpu_single_kernel(draft_probs, draft_tokens, model_probs, token_tree_first_child, token_tree_next_sibling, uniform_samples, token_tree_parent_ptr) + gv4: R.Tuple(R.Tensor((num_nodes, vocab_size), dtype="float32"), R.Tensor((nbatch,), dtype="int32")) = model_probs, token_tree_parent_ptr + return gv4 + + @R.function + def softmax_with_temperature(logits: R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"), temperature: R.Tensor(("batch_size",), dtype="float32")) -> R.Tensor(("batch_size", 1, "vocab_size"), dtype="float32"): + batch_size = T.int64() + vocab_size = T.int64() + R.func_attr({"relax.force_pure": 1, "tir_non_negative_var": ["vocab_size"], "tir_var_upper_bound": {"batch_size": 8, "seq_len": 15000, "total_seq_len": 1500}}) + cls = Module + shape_heap: R.Tensor(dtype="int64", ndim=1) = R.call_builtin_with_ctx("vm.builtin.alloc_shape_heap", (R.prim_value(5),), sinfo_args=(R.Tensor(dtype="int64", ndim=1),)) + R.call_packed("vm.builtin.check_tensor_info", logits, R.prim_value(3), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.check_tensor_info", temperature, R.prim_value(1), R.dtype("float32"), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", logits, shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), 
R.prim_value(1), R.prim_value(1), R.prim_value(1), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[0], param=logits, annotation=R.Tensor((batch_size, 1, vocab_size), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + R.call_packed("vm.builtin.match_shape", temperature, shape_heap, R.prim_value(1), R.prim_value(3), R.prim_value(0), R.str("ErrorContext(fn=softmax_with_temperature, loc=param[1], param=temperature, annotation=R.Tensor((batch_size,), dtype=\"float32\")) "), sinfo_args=(R.Tuple,)) + cls.shape_func5(shape_heap) + gv3455: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + lv: R.Tensor((batch_size, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", logits, gv3455, sinfo_args=(R.Tensor((batch_size, vocab_size), dtype="float32"),)) + gv3456: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + storage46: R.Object = R.vm.alloc_storage(gv3456, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3457: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) + alloc2535: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage46, R.prim_value(0), gv3457, R.dtype("float32")) + R.vm.kill_object(storage46) + gv3458: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(2), sinfo_args=(R.Shape(ndim=1),)) + storage47: R.Object = R.vm.alloc_storage(gv3458, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3459: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(3), sinfo_args=(R.Shape(ndim=2),)) + alloc2536: 
R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage47, R.prim_value(0), gv3459, R.dtype("float32")) + R.vm.kill_object(storage47) + cls.chunk_lse(lv, temperature, alloc2535, alloc2536) + lv1: R.Tuple(R.Tensor(dtype="float32", ndim=2), R.Tensor(dtype="float32", ndim=2)) = alloc2535, alloc2536 + gv3460: R.Shape(ndim=1) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(1), R.prim_value(1), R.prim_value(4), sinfo_args=(R.Shape(ndim=1),)) + storage48: R.Object = R.vm.alloc_storage(gv3460, R.prim_value(0), R.dtype("uint8"), R.str("global")) + gv3461: R.Shape(ndim=2) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(2), R.prim_value(1), R.prim_value(0), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=2),)) + alloc2537: R.Tensor(dtype="float32", ndim=2) = R.vm.alloc_tensor(storage48, R.prim_value(0), gv3461, R.dtype("float32")) + R.vm.kill_object(storage48) + cls.softmax_with_chunked_sum(lv, temperature, alloc2535, alloc2536, alloc2537) + R.vm.kill_object(lv) + R.vm.kill_object(alloc2535) + R.vm.kill_object(alloc2536) + gv3462: R.Shape(ndim=3) = R.call_packed("vm.builtin.make_shape", shape_heap, R.prim_value(3), R.prim_value(1), R.prim_value(0), R.prim_value(0), R.prim_value(1), R.prim_value(1), R.prim_value(1), sinfo_args=(R.Shape(ndim=3),)) + gv: R.Tensor((batch_size, 1, vocab_size), dtype="float32") = R.call_packed("vm.builtin.reshape", alloc2537, gv3462, sinfo_args=(R.Tensor((batch_size, 1, vocab_size), dtype="float32"),)) + R.vm.kill_object(alloc2537) + return gv + +# Metadata omitted. Use show_meta=True in script() method to show it. \ No newline at end of file