JoshuaChak
/

bmodel-qwen1.5-1.8b

Model card Files Files and versions Community

bmodel-qwen1.5-1.8b / Qwen1_5 /speculative_sample_demo /chat_speculative.cpp

JoshuaChak

Upload folder using huggingface_hub

7c071a8 verified 5 months ago

raw

history blame

34.6 kB

	//===----------------------------------------------------------------------===//
	//
	// Copyright (C) 2023 Sophgo Technologies Inc. All rights reserved.
	//
	// TPU-MLIR is licensed under the 2-Clause BSD License except for the
	// third-party components.
	//
	//===----------------------------------------------------------------------===//

	#include <assert.h>
	#include <getopt.h>
	#include <inttypes.h>
	#include <pybind11/pybind11.h>
	#include <pybind11/stl.h>
	#include <stdio.h>

	#include <algorithm>
	#include <chrono>
	#include <cstdlib>
	#include <functional>
	#include <iostream>
	#include <numeric>
	#include <random>
	#include <stdexcept>
	#include <string>
	#include <utility>
	#include <vector>

	#include "bmruntime_interface.h"
	#include "memory.h"

	// Number of tokens the draft model proposes per speculation round (the
	// draft / verify loops in this file iterate K times).
	static const int K = 4;
	// Number of token rows the target model processes per decode round: the K
	// draft tokens plus one.
	static const int GUESS_LEN = K + 1;
	// 16-bit fill value for attention-mask entries that are left masked (entries
	// that may be attended to are overwritten with 0).
	// NOTE(review): read as IEEE half precision this bit pattern is roughly
	// -10000 — confirm that the mask tensors of the bmodel are fp16.
	static const uint16_t ATTENTION_MASK = 0xF0E2;

	// Speculative-sampling driver for a draft/target pair of bmodels.
	//
	// A draft model proposes K tokens per round, a target model scores them in a
	// single batched pass, and generate() accepts/rejects the proposals.
	// The object owns two bmruntime instances (d_bmrt / t_bmrt), the device
	// handles they run on, and one key/value cache per transformer block of each
	// model. init() acquires those resources and deinit() releases them; there is
	// no destructor, so callers must pair the two calls themselves.
	class Qwen {
	public:
	  // Requests the devices, loads both bmodels and sets up the KV caches.
	  void init(const std::vector<int> &devid, std::string draft_model_path,
	            std::string target_model_path);
	  // Releases everything acquired by init().
	  void deinit();
	  // Draft prefill over `tokens`; returns the first proposed token.
	  int draft_forward_first(std::vector<int> &tokens);
	  // One draft decode step; `index` is the row of draft_prob_history to fill.
	  int draft_forward_next(int index);
	  // Target prefill; returns (probabilities, token ids) for GUESS_LEN rows.
	  std::pair<std::vector<float>, std::vector<int>>
	  target_forward_first(std::vector<int> &tokens);
	  // Target decode over the last GUESS_LEN visited tokens.
	  std::pair<std::vector<float>, std::vector<int>> target_forward_next();
	  // Full speculative generation; returns the newly generated tokens.
	  std::vector<int> generate(std::vector<int> &history_tokens, int EOS);

	  // Random engine shared by sampling and verification (fixed seed 42).
	  std::mt19937 sgen;
	  Qwen() : sgen(42) {}

	private:
	  void net_launch(void *p_bmrt, const bm_net_info_t *net, int stage_idx = 0);
	  inline void d2d(bm_device_mem_t &dst, bm_device_mem_t &src);

	  void head_launch(void *p_bmrt, const bm_net_info_t *net,
	                   bm_device_mem_t &logits_mem);
	  int greedy_search(void *p_bmrt, const bm_net_info_t *net,
	                    bm_device_mem_t &logits_mem);
	  std::pair<std::vector<float>, std::vector<int>>
	  penalty_sample(void *p_bmrt, const bm_net_info_t *net,
	                 bm_device_mem_t &logits_mem,
	                 const std::vector<int> &visited_tokens, int token_length);
	  std::pair<std::vector<float>, std::vector<int>> batch_penalty_sample(
	      void *p_bmrt, const bm_net_info_t *net, bm_device_mem_t &logits_mem,
	      const std::vector<int> &visited_tokens, int token_length);
	  void roll_back(std::vector<float> &probs, std::vector<int> &tokens,
	                 std::vector<float> &prob_history, int index);
	  int sample_from_probs(std::vector<float> &probs, std::vector<int> &tokens);
	  int verify(std::vector<int> &guess_tokens,
	             std::uniform_real_distribution<float> &udist);
	  int resample(std::vector<float> &probs, std::vector<int> &tokens,
	               int accepted);

	public:
	  int SEQLEN = 0;            // read from bmodel
	  int DRAFT_NUM_LAYERS = 0;  // read from bmodel
	  int TARGET_NUM_LAYERS = 0; // read from bmodel
	  int candidate_num = 0;     // read from bmodel
	  bool draft_io_alone = false;
	  bool target_io_alone = false;
	  int VOCAB_SIZE = 0;
	  std::vector<int> draft_visited_tokens;
	  std::vector<int> target_visited_tokens;
	  int draft_token_length = 0;
	  int target_token_length = 0;
	  // K rows of VOCAB_SIZE probabilities each, indexed token + row * VOCAB_SIZE.
	  std::vector<float> draft_prob_history;
	  std::vector<float> target_prob_history;

	  // generation
	  // These are exposed read/write through the Python binding. The values below
	  // are only placeholders so the members are never read uninitialized.
	  // NOTE(review): presumably the Python pipeline overwrites them before
	  // generate() — confirm.
	  float temperature = 1.0f;
	  float top_p = 1.0f;
	  float repeat_penalty = 1.0f;
	  int repeat_last_n = 0;
	  int max_new_tokens = 0;
	  std::string generation_mode;
	  std::string prompt_mode;

	private:
	  std::vector<bm_handle_t> handles;
	  bm_handle_t bm_handle{}; // handles[0] after init()

	  // Draft model runtime and networks.
	  void *d_bmrt = nullptr;
	  std::vector<const bm_net_info_t *> draft_net_blocks;
	  std::vector<const bm_net_info_t *> draft_net_blocks_cache;
	  const bm_net_info_t *draft_net_embed = nullptr;
	  const bm_net_info_t *draft_net_embed_cache = nullptr;
	  const bm_net_info_t *draft_net_lm = nullptr;
	  const bm_net_info_t *draft_net_greedy_head = nullptr;
	  const bm_net_info_t *draft_net_penalty_sample_head = nullptr;
	  std::vector<bm_device_mem_t> draft_past_key;
	  std::vector<bm_device_mem_t> draft_past_value;

	  // Target model runtime and networks.
	  void *t_bmrt = nullptr;
	  std::vector<const bm_net_info_t *> target_net_blocks;
	  std::vector<const bm_net_info_t *> target_net_blocks_cache;
	  const bm_net_info_t *target_net_embed = nullptr;
	  const bm_net_info_t *target_net_embed_cache = nullptr;
	  const bm_net_info_t *target_net_lm = nullptr;
	  const bm_net_info_t *target_net_greedy_head = nullptr;
	  const bm_net_info_t *target_net_penalty_sample_head = nullptr;
	  std::vector<bm_device_mem_t> target_past_key;
	  std::vector<bm_device_mem_t> target_past_value;
	};

	void Qwen::net_launch(void p_bmrt, const bm_net_info_t net, int stage_idx) {
	std::vector<bm_tensor_t> in_tensors(net->input_num);
	std::vector<bm_tensor_t> out_tensors(net->output_num);

	for (int i = 0; i < net->input_num; i++) {
	bmrt_tensor_with_device(
	&in_tensors[i], net->stages[stage_idx].input_mems[i],
	net->input_dtypes[i], net->stages[stage_idx].input_shapes[i]);
	}
	for (int i = 0; i < net->output_num; i++) {
	bmrt_tensor_with_device(
	&out_tensors[i], net->stages[stage_idx].output_mems[i],
	net->output_dtypes[i], net->stages[stage_idx].output_shapes[i]);
	}
	auto ret = bmrt_launch_tensor_ex(p_bmrt, net->name, in_tensors.data(),
	net->input_num, out_tensors.data(),
	net->output_num, true, false);
	assert(ret);
	bm_thread_sync(bm_handle);
	}

	// Device-to-device copy of the whole of `src` (its reported device size) to
	// the beginning of `dst`.
	void Qwen::d2d(bm_device_mem_t &dst, bm_device_mem_t &src) {
	  const auto src_bytes = bm_mem_get_device_size(src);
	  bm_memcpy_d2d_byte(bm_handle, dst, 0, src, 0, src_bytes);
	}

	void Qwen::init(const std::vector<int> &devices, std::string draft_model_path,
	std::string target_model_path) {
	// request bm_handle
	std::cout << "Device [ ";
	for (auto d : devices) {
	std::cout << d << " ";
	}
	std::cout << "] loading ....\n";
	for (auto d : devices) {
	bm_handle_t h;
	bm_status_t status = bm_dev_request(&h, d);
	assert(BM_SUCCESS == status);
	handles.push_back(h);
	}
	bm_handle = handles[0];

	// create bmruntime
	#ifdef SOC_TARGET
	d_bmrt = bmrt_create(handles[0]);
	t_bmrt = bmrt_create(handles[0]);
	#else
	d_bmrt = bmrt_create_ex(handles.data(), handles.size());
	t_bmrt = bmrt_create_ex(handles.data(), handles.size());
	#endif
	assert(NULL != d_bmrt);
	assert(NULL != t_bmrt);

	// load bmodel by file
	printf("Model[%s] loading ....\n", draft_model_path.c_str());
	assert(true == bmrt_load_bmodel(d_bmrt, draft_model_path.c_str()));

	printf("Model[%s] loading ....\n", target_model_path.c_str());
	assert(true == bmrt_load_bmodel(t_bmrt, target_model_path.c_str()));
	printf("Done!\n");

	// draft net embed and lm_head
	draft_net_embed = bmrt_get_network_info(d_bmrt, "embedding");
	draft_net_embed_cache = bmrt_get_network_info(d_bmrt, "embedding_cache");
	draft_net_lm = bmrt_get_network_info(d_bmrt, "lm_head");
	draft_net_greedy_head = bmrt_get_network_info(d_bmrt, "greedy_head");
	draft_net_penalty_sample_head =
	bmrt_get_network_info(d_bmrt, "penalty_sample_head");
	auto draft_num_nets = bmrt_get_network_number(d_bmrt);
	DRAFT_NUM_LAYERS = (draft_num_nets - 5) / 2;

	// draft net blocks
	for (int i = 0; i < DRAFT_NUM_LAYERS; i++) {
	auto block_name = "block_" + std::to_string(i);
	auto cache_name = "block_cache_" + std::to_string(i);
	draft_net_blocks.emplace_back(
	bmrt_get_network_info(d_bmrt, block_name.c_str()));
	draft_net_blocks_cache.emplace_back(
	bmrt_get_network_info(d_bmrt, cache_name.c_str()));
	}

	// draft kv cache
	draft_past_key.resize(DRAFT_NUM_LAYERS);
	draft_past_value.resize(DRAFT_NUM_LAYERS);
	auto draft_addr_mode = draft_net_blocks_cache[0]->addr_mode;
	draft_io_alone = draft_addr_mode == 1;
	for (int i = 0; i < DRAFT_NUM_LAYERS; i++) {
	assert(draft_addr_mode == draft_net_blocks_cache[i]->addr_mode);
	if (draft_io_alone) {
	draft_past_key[i] = draft_net_blocks_cache[i]->stages[0].input_mems[3];
	draft_past_value[i] = draft_net_blocks_cache[i]->stages[0].input_mems[4];
	} else {
	auto ret =
	bm_malloc_device_byte(bm_handle, &draft_past_key[i],
	draft_net_blocks_cache[i]->max_input_bytes[3]);
	assert(BM_SUCCESS == ret);
	ret =
	bm_malloc_device_byte(bm_handle, &draft_past_value[i],
	draft_net_blocks_cache[i]->max_input_bytes[4]);
	assert(BM_SUCCESS == ret);
	}
	}

	// target net embed and lm_head
	target_net_embed = bmrt_get_network_info(t_bmrt, "embedding");
	target_net_embed_cache = bmrt_get_network_info(t_bmrt, "embedding_cache");
	target_net_lm = bmrt_get_network_info(t_bmrt, "lm_head");
	auto target_num_nets = bmrt_get_network_number(t_bmrt);
	TARGET_NUM_LAYERS = (target_num_nets - 3) / 2;

	// target net blocks
	for (int i = 0; i < TARGET_NUM_LAYERS; i++) {
	auto block_name = "block_" + std::to_string(i);
	auto cache_name = "block_cache_" + std::to_string(i);
	target_net_blocks.emplace_back(
	bmrt_get_network_info(t_bmrt, block_name.c_str()));
	target_net_blocks_cache.emplace_back(
	bmrt_get_network_info(t_bmrt, cache_name.c_str()));
	}

	// target kv cache
	target_past_key.resize(TARGET_NUM_LAYERS);
	target_past_value.resize(TARGET_NUM_LAYERS);
	auto target_addr_mode = target_net_blocks_cache[0]->addr_mode;
	target_io_alone = target_addr_mode == 1;
	for (int i = 0; i < TARGET_NUM_LAYERS; i++) {
	assert(target_addr_mode == target_net_blocks_cache[i]->addr_mode);
	if (target_io_alone) {
	target_past_key[i] = target_net_blocks_cache[i]->stages[0].input_mems[3];
	target_past_value[i] =
	target_net_blocks_cache[i]->stages[0].input_mems[4];
	} else {
	auto ret =
	bm_malloc_device_byte(bm_handle, &target_past_key[i],
	target_net_blocks_cache[i]->max_input_bytes[3]);
	assert(BM_SUCCESS == ret);
	ret =
	bm_malloc_device_byte(bm_handle, &target_past_value[i],
	target_net_blocks_cache[i]->max_input_bytes[4]);
	assert(BM_SUCCESS == ret);
	}
	}

	// resize
	assert(draft_net_embed->stages[0].input_shapes[0].dims[1] ==
	target_net_embed->stages[0].input_shapes[0].dims[1]);
	SEQLEN = draft_net_embed->stages[0].input_shapes[0].dims[1];

	VOCAB_SIZE = draft_net_lm->stages[0].output_shapes[0].dims[1];

	candidate_num =
	draft_net_penalty_sample_head->stages[0].output_shapes[0].dims[1];

	draft_visited_tokens.resize(SEQLEN);
	target_visited_tokens.resize(SEQLEN);
	draft_prob_history.resize(K * VOCAB_SIZE);
	target_prob_history.resize(K * VOCAB_SIZE);
	}

	void Qwen::deinit() {
	if (false == draft_io_alone) {
	for (int i = 0; i < DRAFT_NUM_LAYERS; i++) {
	bm_free_device(bm_handle, draft_past_key[i]);
	bm_free_device(bm_handle, draft_past_value[i]);
	}
	}
	if (false == target_io_alone) {
	for (int i = 0; i < TARGET_NUM_LAYERS; i++) {
	bm_free_device(bm_handle, target_past_key[i]);
	bm_free_device(bm_handle, target_past_value[i]);
	}
	}
	bmrt_destroy(d_bmrt);
	bmrt_destroy(t_bmrt);
	for (auto h : handles) {
	bm_dev_free(h);
	}
	}

	void Qwen::head_launch(void p_bmrt, const bm_net_info_t net,
	bm_device_mem_t &logits_mem) {
	std::vector<bm_tensor_t> in_tensors(net->input_num);
	std::vector<bm_tensor_t> out_tensors(net->output_num);

	bmrt_tensor_with_device(&in_tensors[0], logits_mem, net->input_dtypes[0],
	net->stages[0].input_shapes[0]);

	for (int i = 1; i < net->input_num; i++) {
	bmrt_tensor_with_device(&in_tensors[i], net->stages[0].input_mems[i],
	net->input_dtypes[i],
	net->stages[0].input_shapes[i]);
	}
	for (int i = 0; i < net->output_num; i++) {
	bmrt_tensor_with_device(&out_tensors[i], net->stages[0].output_mems[i],
	net->output_dtypes[i],
	net->stages[0].output_shapes[i]);
	}
	auto ret = bmrt_launch_tensor_ex(p_bmrt, net->name, in_tensors.data(),
	net->input_num, out_tensors.data(),
	net->output_num, true, false);
	assert(ret);
	bm_thread_sync(bm_handle);
	}

	int Qwen::greedy_search(void p_bmrt, const bm_net_info_t net,
	bm_device_mem_t &logits_mem) {
	auto &out_mem = net->stages[0].output_mems[0];
	head_launch(p_bmrt, net, logits_mem);
	int token = 0;
	bm_memcpy_d2s(bm_handle, (void *)&token, out_mem);
	return token;
	}

	std::pair<std::vector<float>, std::vector<int>>
	Qwen::penalty_sample(void p_bmrt, const bm_net_info_t net,
	bm_device_mem_t &logits_mem,
	const std::vector<int> &visited_tokens, int token_length) {
	auto &in1_mem = net->stages[0].input_mems[1];
	auto &in2_mem = net->stages[0].input_mems[2];
	auto &in3_mem = net->stages[0].input_mems[3];
	auto &in4_mem = net->stages[0].input_mems[4];
	auto &out0_mem = net->stages[0].output_mems[0];
	auto &out1_mem = net->stages[0].output_mems[1];

	// repeat_penalty + top_p + top_k + temperature
	std::vector<int> generated_tokens(SEQLEN, visited_tokens[token_length - 1]);
	repeat_last_n = std::min(repeat_last_n, token_length);
	std::copy(visited_tokens.begin() + token_length - repeat_last_n,
	visited_tokens.begin() + token_length, generated_tokens.begin());
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)generated_tokens.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)&top_p);
	bm_memcpy_s2d(bm_handle, in3_mem, (void *)&temperature);
	bm_memcpy_s2d(bm_handle, in4_mem, (void *)&repeat_penalty);

	// inference
	head_launch(p_bmrt, net, logits_mem);

	// get logit & token
	int candidate_num = net->stages[0].output_shapes[0].dims[1];
	std::vector<float> probs(candidate_num);
	bm_memcpy_d2s(bm_handle, probs.data(), out0_mem);
	std::vector<int> tokens(candidate_num);
	bm_memcpy_d2s(bm_handle, tokens.data(), out1_mem);

	return std::make_pair(probs, tokens);
	}

	std::pair<std::vector<float>, std::vector<int>> Qwen::batch_penalty_sample(
	void p_bmrt, const bm_net_info_t net, bm_device_mem_t &logits_mem,
	const std::vector<int> &visited_tokens, int token_length) {
	auto &in1_mem = net->stages[0].input_mems[1];
	auto &in2_mem = net->stages[0].input_mems[2];
	auto &in3_mem = net->stages[0].input_mems[3];
	auto &in4_mem = net->stages[0].input_mems[4];
	auto &out0_mem = net->stages[0].output_mems[0];
	auto &out1_mem = net->stages[0].output_mems[1];

	// repeat_penalty + top_p + top_k + temperature
	std::vector<int> generated_tokens(SEQLEN, visited_tokens[token_length - 1]);
	repeat_last_n = std::min(repeat_last_n, token_length);
	std::copy(visited_tokens.begin() + token_length - repeat_last_n,
	visited_tokens.begin() + token_length, generated_tokens.begin());
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)generated_tokens.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)&top_p);
	bm_memcpy_s2d(bm_handle, in3_mem, (void *)&temperature);
	bm_memcpy_s2d(bm_handle, in4_mem, (void *)&repeat_penalty);

	// inference
	head_launch(p_bmrt, net, logits_mem);

	// get logit & token
	std::vector<float> probs(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, probs.data(), out0_mem);
	std::vector<int> tokens(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, tokens.data(), out1_mem);

	return std::make_pair(probs, tokens);
	}

	// Scatters candidate probabilities into row `index` of `prob_history`:
	// for every candidate i, prob_history[tokens[i] + index * VOCAB_SIZE] is set
	// to probs[i]. Entries of the row that are not candidates stay untouched.
	void Qwen::roll_back(std::vector<float> &probs, std::vector<int> &tokens,
	                     std::vector<float> &prob_history, int index) {
	  const int row_offset = index * VOCAB_SIZE;
	  size_t slot = 0;
	  for (const int token : tokens) {
	    prob_history[token + row_offset] = probs[slot];
	    ++slot;
	  }
	}

	// Draws one position with probability proportional to `probs` (using the
	// member engine sgen) and returns the token stored at that position.
	int Qwen::sample_from_probs(std::vector<float> &probs,
	                            std::vector<int> &tokens) {
	  std::discrete_distribution<> picker(probs.begin(), probs.end());
	  const auto chosen = picker(sgen);
	  return tokens[chosen];
	}

	//===------------------------------------------------------------===//
	// Draft Model Forward
	//===------------------------------------------------------------===//

	int Qwen::draft_forward_first(std::vector<int> &tokens) {
	std::vector<int> position_id(SEQLEN, 0);
	std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, ATTENTION_MASK);
	std::copy(tokens.begin(), tokens.end(), draft_visited_tokens.data());

	draft_token_length = tokens.size();

	for (int i = 0; i < draft_token_length; i++) {
	position_id[i] = i;
	}
	for (int i = 0; i < draft_token_length; i++) {
	for (int j = 0; j < SEQLEN; j++) {
	if (j <= i) {
	attention_mask[i * SEQLEN + j] = 0;
	}
	}
	}

	// forward embeding
	auto &in_mem = draft_net_embed->stages[0].input_mems[0];
	auto &out_mem = draft_net_embed->stages[0].output_mems[0];
	bm_memcpy_s2d(bm_handle, in_mem, (void *)draft_visited_tokens.data());
	net_launch(d_bmrt, draft_net_embed); // prefil embedding

	// forward blocks
	for (int idx = 0; idx < DRAFT_NUM_LAYERS; idx++) {
	auto &in0_mem = draft_net_blocks[idx]->stages[0].input_mems[0];
	auto &in1_mem = draft_net_blocks[idx]->stages[0].input_mems[1];
	auto &in2_mem = draft_net_blocks[idx]->stages[0].input_mems[2];
	d2d(in0_mem, out_mem);
	if (idx == 0) {
	// only first time need copy
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)position_id.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	}
	net_launch(d_bmrt, draft_net_blocks[idx]);
	out_mem = draft_net_blocks[idx]->stages[0].output_mems[0];
	d2d(draft_past_key[idx], draft_net_blocks[idx]->stages[0].output_mems[1]);
	d2d(draft_past_value[idx], draft_net_blocks[idx]->stages[0].output_mems[2]);
	}

	// forward lmhead
	int bytes = out_mem.size / SEQLEN;
	auto &lm_in_mem = draft_net_lm->stages[0].input_mems[0];
	auto &lm_out_mem = draft_net_lm->stages[0].output_mems[0];
	bm_memcpy_d2d_byte(bm_handle, lm_in_mem, 0, out_mem,
	(draft_token_length - 1) * bytes, bytes);
	net_launch(d_bmrt, draft_net_lm);

	auto pair = penalty_sample(d_bmrt, draft_net_penalty_sample_head, lm_out_mem,
	draft_visited_tokens, draft_token_length);

	auto &candidate_probs = pair.first;
	auto &candidate_tokens = pair.second;

	// roll back
	roll_back(candidate_probs, candidate_tokens, draft_prob_history, 0);

	auto token = sample_from_probs(candidate_probs, candidate_tokens);
	draft_visited_tokens[draft_token_length] = token;
	draft_token_length += 1;
	return token;
	}

	int Qwen::draft_forward_next(int index) {
	int cur_token = draft_visited_tokens[draft_token_length - 1];

	std::vector<uint16_t> attention_mask(SEQLEN + 1, 0);
	for (int i = draft_token_length - 1; i < SEQLEN; i++) {
	attention_mask[i] = ATTENTION_MASK;
	}
	int32_t position_id = draft_token_length - 1;
	// embedding
	auto &in_mem = draft_net_embed_cache->stages[0].input_mems[0];
	auto &out_mem = draft_net_embed_cache->stages[0].output_mems[0];
	bm_memcpy_s2d(bm_handle, in_mem, (void *)&cur_token);
	net_launch(d_bmrt, draft_net_embed_cache);

	// blocks
	int bytes = bm_mem_get_device_size(
	draft_net_blocks_cache[0]->stages[0].output_mems[1]);
	int token_offset = (draft_token_length - 1) * bytes;
	for (int idx = 0; idx < DRAFT_NUM_LAYERS; idx++) {
	auto &in0_mem = draft_net_blocks_cache[idx]->stages[0].input_mems[0];
	auto &in1_mem = draft_net_blocks_cache[idx]->stages[0].input_mems[1];
	auto &in2_mem = draft_net_blocks_cache[idx]->stages[0].input_mems[2];
	auto &in3_mem = draft_net_blocks_cache[idx]->stages[0].input_mems[3];
	auto &in4_mem = draft_net_blocks_cache[idx]->stages[0].input_mems[4];
	auto &out0_mem = draft_net_blocks_cache[idx]->stages[0].output_mems[0];
	auto &out1_mem = draft_net_blocks_cache[idx]->stages[0].output_mems[1];
	auto &out2_mem = draft_net_blocks_cache[idx]->stages[0].output_mems[2];
	d2d(in0_mem, out_mem);
	if (draft_io_alone) {
	if (idx == 0) {
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)&position_id);
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	} else {
	d2d(in1_mem, draft_net_blocks_cache[0]->stages[0].input_mems[1]);
	d2d(in2_mem, draft_net_blocks_cache[0]->stages[0].input_mems[2]);
	}
	} else {
	if (idx == 0) {
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)&position_id);
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	}
	d2d(in3_mem, draft_past_key[idx]);
	d2d(in4_mem, draft_past_value[idx]);
	}
	net_launch(d_bmrt, draft_net_blocks_cache[idx]);
	out_mem = out0_mem;
	bm_memcpy_d2d_byte(bm_handle, draft_past_key[idx], token_offset, out1_mem,
	0, bytes);
	bm_memcpy_d2d_byte(bm_handle, draft_past_value[idx], token_offset, out2_mem,
	0, bytes);
	}

	// forward lmhead
	auto &lm_in_mem = draft_net_lm->stages[0].input_mems[0];
	auto &lm_out_mem = draft_net_lm->stages[0].output_mems[0];
	d2d(lm_in_mem, out_mem);
	net_launch(d_bmrt, draft_net_lm);

	auto pair = penalty_sample(d_bmrt, draft_net_penalty_sample_head, lm_out_mem,
	draft_visited_tokens, draft_token_length);

	auto &candidate_probs = pair.first;
	auto &candidate_tokens = pair.second;

	// roll back
	roll_back(candidate_probs, candidate_tokens, draft_prob_history, index);

	auto token = sample_from_probs(candidate_probs, candidate_tokens);
	draft_visited_tokens[draft_token_length] = token;
	draft_token_length += 1;
	return token;
	}

	//===------------------------------------------------------------===//
	// Target Model Forward
	//===------------------------------------------------------------===//

	std::pair<std::vector<float>, std::vector<int>>
	Qwen::target_forward_first(std::vector<int> &tokens) {
	std::vector<int> position_id(SEQLEN, 0);
	std::vector<uint16_t> attention_mask(SEQLEN * SEQLEN, ATTENTION_MASK);
	std::copy(tokens.begin(), tokens.end(), target_visited_tokens.data());

	target_token_length = tokens.size();

	for (int i = 0; i < target_token_length; i++) {
	position_id[i] = i;
	}
	for (int i = 0; i < target_token_length; i++) {
	for (int j = 0; j < SEQLEN; j++) {
	if (j <= i) {
	attention_mask[i * SEQLEN + j] = 0;
	}
	}
	}

	// forward embeding
	auto &in_mem = target_net_embed->stages[0].input_mems[0];
	auto &out_mem = target_net_embed->stages[0].output_mems[0];
	bm_memcpy_s2d(bm_handle, in_mem, (void *)target_visited_tokens.data());
	net_launch(t_bmrt, target_net_embed); // prefil embedding

	// forward blocks
	for (int idx = 0; idx < TARGET_NUM_LAYERS; idx++) {
	auto &in0_mem = target_net_blocks[idx]->stages[0].input_mems[0];
	auto &in1_mem = target_net_blocks[idx]->stages[0].input_mems[1];
	auto &in2_mem = target_net_blocks[idx]->stages[0].input_mems[2];
	d2d(in0_mem, out_mem);
	if (idx == 0) {
	// only first time need copy
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)position_id.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	}
	net_launch(t_bmrt, target_net_blocks[idx]);
	out_mem = target_net_blocks[idx]->stages[0].output_mems[0];
	d2d(target_past_key[idx], target_net_blocks[idx]->stages[0].output_mems[1]);
	d2d(target_past_value[idx],
	target_net_blocks[idx]->stages[0].output_mems[2]);
	}

	// forward lmhead
	int bytes = out_mem.size / SEQLEN;
	auto &lm_in0_mem = target_net_lm->stages[0].input_mems[0];
	auto &lm_in1_mem = target_net_lm->stages[0].input_mems[1];
	auto &lm_in2_mem = target_net_lm->stages[0].input_mems[2];
	auto &lm_in3_mem = target_net_lm->stages[0].input_mems[3];
	auto &lm_in4_mem = target_net_lm->stages[0].input_mems[4];
	auto &lm_out0_mem = target_net_lm->stages[0].output_mems[0];
	auto &lm_out1_mem = target_net_lm->stages[0].output_mems[1];

	// repeat_penalty + top_p + top_k + temperature
	bm_memcpy_d2d_byte(bm_handle, lm_in0_mem, 0, out_mem,
	(target_token_length - GUESS_LEN) * bytes,
	GUESS_LEN * bytes);
	std::vector<int> generated_tokens(SEQLEN, target_visited_tokens[target_token_length - 1]);
	repeat_last_n = std::min(repeat_last_n, target_token_length);
	std::copy(target_visited_tokens.begin() + target_token_length - repeat_last_n,
	target_visited_tokens.begin() + target_token_length, generated_tokens.begin());
	bm_memcpy_s2d(bm_handle, lm_in1_mem, (void *)generated_tokens.data());
	bm_memcpy_s2d(bm_handle, lm_in2_mem, (void *)&top_p);
	bm_memcpy_s2d(bm_handle, lm_in3_mem, (void *)&temperature);
	bm_memcpy_s2d(bm_handle, lm_in4_mem, (void *)&repeat_penalty);

	// inference
	net_launch(t_bmrt, target_net_lm);

	// get logit & token
	std::vector<float> batch_probs(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, batch_probs.data(), lm_out0_mem);
	std::vector<int> batch_tokens(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, batch_tokens.data(), lm_out1_mem);


	for (int i = 0; i < K; i++) {
	std::vector<float> candidate_probs(batch_probs.begin() + i * candidate_num,
	batch_probs.begin() +
	(i + 1) * candidate_num);
	std::vector<int> candidate_tokens(batch_tokens.begin() + i * candidate_num,
	batch_tokens.begin() +
	(i + 1) * candidate_num);

	roll_back(candidate_probs, candidate_tokens, target_prob_history, i);
	}

	target_token_length += 1;
	return std::make_pair(batch_probs, batch_tokens);
	}

	std::pair<std::vector<float>, std::vector<int>> Qwen::target_forward_next() {
	std::vector<int> cur_tokens(
	target_visited_tokens.begin() + target_token_length - GUESS_LEN,
	target_visited_tokens.begin() + target_token_length);
	std::vector<uint16_t> attention_mask((GUESS_LEN) * (SEQLEN + GUESS_LEN),
	ATTENTION_MASK);
	std::vector<int> position_ids(GUESS_LEN, 0);

	for (int i = 0; i < GUESS_LEN; i++) {
	for (int j = 0; j < target_token_length - GUESS_LEN; j++) {
	attention_mask[i * (SEQLEN + GUESS_LEN) + j] = 0;
	}
	for (int j = SEQLEN; j < SEQLEN + i + 1; j++) {
	attention_mask[i * (SEQLEN + GUESS_LEN) + j] = 0;
	}
	position_ids[i] = target_token_length + i - GUESS_LEN;
	}

	// embedding
	auto &in_mem = target_net_embed_cache->stages[0].input_mems[0];
	auto &out_mem = target_net_embed_cache->stages[0].output_mems[0];
	bm_memcpy_s2d(bm_handle, in_mem, (void *)cur_tokens.data());
	net_launch(t_bmrt, target_net_embed_cache);

	// blocks
	int bytes = bm_mem_get_device_size(
	target_net_blocks_cache[0]->stages[0].output_mems[1]) /
	GUESS_LEN;
	int token_offset = (target_token_length - GUESS_LEN) * bytes;
	for (int idx = 0; idx < TARGET_NUM_LAYERS; idx++) {
	auto &in0_mem = target_net_blocks_cache[idx]->stages[0].input_mems[0];
	auto &in1_mem = target_net_blocks_cache[idx]->stages[0].input_mems[1];
	auto &in2_mem = target_net_blocks_cache[idx]->stages[0].input_mems[2];
	auto &in3_mem = target_net_blocks_cache[idx]->stages[0].input_mems[3];
	auto &in4_mem = target_net_blocks_cache[idx]->stages[0].input_mems[4];
	auto &out0_mem = target_net_blocks_cache[idx]->stages[0].output_mems[0];
	auto &out1_mem = target_net_blocks_cache[idx]->stages[0].output_mems[1];
	auto &out2_mem = target_net_blocks_cache[idx]->stages[0].output_mems[2];
	d2d(in0_mem, out_mem);
	if (target_io_alone) {
	if (idx == 0) {
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)position_ids.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	} else {
	d2d(in1_mem, target_net_blocks_cache[0]->stages[0].input_mems[1]);
	d2d(in2_mem, target_net_blocks_cache[0]->stages[0].input_mems[2]);
	}
	} else {
	if (idx == 0) {
	bm_memcpy_s2d(bm_handle, in1_mem, (void *)position_ids.data());
	bm_memcpy_s2d(bm_handle, in2_mem, (void *)attention_mask.data());
	}
	d2d(in3_mem, target_past_key[idx]);
	d2d(in4_mem, target_past_value[idx]);
	}
	net_launch(t_bmrt, target_net_blocks_cache[idx]);
	out_mem = out0_mem;
	bm_memcpy_d2d_byte(bm_handle, target_past_key[idx], token_offset, out1_mem,
	0, GUESS_LEN * bytes);
	bm_memcpy_d2d_byte(bm_handle, target_past_value[idx], token_offset,
	out2_mem, 0, GUESS_LEN * bytes);
	}

	// forward lmhead
	auto &lm_in0_mem = target_net_lm->stages[0].input_mems[0];
	auto &lm_in1_mem = target_net_lm->stages[0].input_mems[1];
	auto &lm_in2_mem = target_net_lm->stages[0].input_mems[2];
	auto &lm_in3_mem = target_net_lm->stages[0].input_mems[3];
	auto &lm_in4_mem = target_net_lm->stages[0].input_mems[4];
	auto &lm_out0_mem = target_net_lm->stages[0].output_mems[0];
	auto &lm_out1_mem = target_net_lm->stages[0].output_mems[1];

	// repeat_penalty + top_p + top_k + temperature
	d2d(lm_in0_mem, out_mem);
	std::vector<int> generated_tokens(SEQLEN, target_visited_tokens[target_token_length - 1]);
	repeat_last_n = std::min(repeat_last_n, target_token_length);
	std::copy(target_visited_tokens.begin() + target_token_length - repeat_last_n,
	target_visited_tokens.begin() + target_token_length, generated_tokens.begin());
	bm_memcpy_s2d(bm_handle, lm_in1_mem, (void *)generated_tokens.data());
	bm_memcpy_s2d(bm_handle, lm_in2_mem, (void *)&top_p);
	bm_memcpy_s2d(bm_handle, lm_in3_mem, (void *)&temperature);
	bm_memcpy_s2d(bm_handle, lm_in4_mem, (void *)&repeat_penalty);

	// inference
	net_launch(t_bmrt, target_net_lm);

	// get logit & token
	std::vector<float> batch_probs(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, batch_probs.data(), lm_out0_mem);
	std::vector<int> batch_tokens(candidate_num * GUESS_LEN);
	bm_memcpy_d2s(bm_handle, batch_tokens.data(), lm_out1_mem);


	for (int i = 0; i < K; i++) {
	std::vector<float> candidate_probs(batch_probs.begin() + i * candidate_num,
	batch_probs.begin() +
	(i + 1) * candidate_num);
	std::vector<int> candidate_tokens(batch_tokens.begin() + i * candidate_num,
	batch_tokens.begin() +
	(i + 1) * candidate_num);

	roll_back(candidate_probs, candidate_tokens, target_prob_history, i);
	}

	target_token_length += 1;
	return std::make_pair(batch_probs, batch_tokens);
	}

	// Counts how many leading draft tokens are accepted.
	//
	// Token i is accepted when a draw r from `udist` satisfies
	// r <= target_prob / draft_prob, both read from row i of the probability
	// histories at guess_tokens[i]. The first rejection stops the scan.
	//
	// guess_tokens  the K draft tokens of this round (at least K entries).
	// udist         distribution the acceptance draws are taken from; the caller
	//               in this file passes a uniform distribution on [0, 1).
	// Returns the number of accepted tokens, 0..K.
	int Qwen::verify(std::vector<int> &guess_tokens,
	                 std::uniform_real_distribution<float> &udist) {
	  int accepted = 0;
	  for (int i = 0; i < K; i++) {
	    const float randomValue = udist(sgen);
	    const int slot = guess_tokens[i] + VOCAB_SIZE * i;
	    const float target_prob = target_prob_history[slot];
	    const float draft_prob = draft_prob_history[slot];
	    // A non-positive draft probability would make the ratio inf or NaN, and
	    // a comparison against NaN is always false, i.e. the token would be
	    // accepted. Treat that case as a rejection instead.
	    if (draft_prob <= 0.0f || randomValue > target_prob / draft_prob) {
	      break;
	    }
	    accepted += 1;
	  }
	  return accepted;
	}

	// Picks the token that follows the accepted draft tokens.
	//
	// probs / tokens  target-model candidates, candidate_num entries per row.
	// accepted        number of accepted draft tokens (0..K); selects the row.
	//
	// * accepted == K: every draft token was accepted, so the extra token is
	//   sampled directly from target row K. There is no draft distribution for
	//   that row (the histories only have K rows), so nothing is subtracted.
	//   The draft model is stepped once more so that its KV cache also covers
	//   the last accepted draft token.
	// * accepted < K: the draft token at row `accepted` was rejected, so the
	//   replacement is sampled from the residual max(0, target - draft) over the
	//   target's candidates. If the residual is zero everywhere, the plain target
	//   probabilities are used.
	// Returns the sampled token.
	int Qwen::resample(std::vector<float> &probs, std::vector<int> &tokens,
	                   int accepted) {
	  const int row_begin = accepted * candidate_num;
	  const int row_end = row_begin + candidate_num;
	  std::vector<int> modified_tokens(tokens.begin() + row_begin,
	                                   tokens.begin() + row_end);
	  std::vector<float> modified_probs(probs.begin() + row_begin,
	                                    probs.begin() + row_end);

	  if (accepted == K) {
	    draft_forward_next(0); // important !!!
	  } else {
	    std::vector<float> residual(candidate_num, 0.0f);
	    float residual_mass = 0.0f;
	    for (int i = 0; i < candidate_num; i++) {
	      const float draft_prob =
	          draft_prob_history[modified_tokens[i] + accepted * VOCAB_SIZE];
	      // Weights handed to the sampler must not be negative.
	      residual[i] = std::max(0.0f, modified_probs[i] - draft_prob);
	      residual_mass += residual[i];
	    }
	    if (residual_mass > 0.0f) {
	      modified_probs.swap(residual);
	    }
	  }
	  return sample_from_probs(modified_probs, modified_tokens);
	}

	std::vector<int> Qwen::generate(std::vector<int> &history_tokens, int EOS) {
	if (history_tokens.empty()) {
	printf("Sorry: your question is empty!!\n");
	history_tokens.clear();
	return {};
	}

	// make sure token not too large
	if ((int)history_tokens.size() > SEQLEN - 10) {
	history_tokens.clear();
	printf("Error: your question is too large!\n");
	return {};
	}

	int accepted = 0;
	std::vector<int> guess_tokens;
	std::vector<int> result_tokens;
	std::uniform_real_distribution<float> udist(0.0f, 1.0f);

	// 1. Prefill
	// draft_model forward K
	guess_tokens.emplace_back(draft_forward_first(history_tokens));
	for (int i = 1; i < K; i++) {
	guess_tokens.emplace_back(draft_forward_next(i));
	}

	// target_model forward
	std::vector<int> target_tokens(history_tokens);
	target_tokens.insert(target_tokens.end(), guess_tokens.begin(),
	guess_tokens.end());
	auto pair = target_forward_first(target_tokens);

	// Verify
	accepted = verify(guess_tokens, udist);

	for (int i = 0; i < accepted; i++) {
	result_tokens.emplace_back(guess_tokens[i]);
	}

	// Resample
	int last_token = resample(pair.first, pair.second, accepted);
	result_tokens.emplace_back(last_token);

	// 2. Decode
	while (std::find(result_tokens.end() - GUESS_LEN, result_tokens.end(), EOS) ==
	result_tokens.end() &&
	result_tokens.size() < SEQLEN - history_tokens.size() - 10) {
	guess_tokens.clear();
	draft_prob_history.clear();
	target_prob_history.clear();

	// draft model forward
	draft_token_length = history_tokens.size() + result_tokens.size();
	draft_visited_tokens[draft_token_length - 1] = last_token;
	for (int i = 0; i < K; i++) {
	guess_tokens.emplace_back(draft_forward_next(i));
	}

	// target model forward
	target_token_length = draft_token_length;
	target_visited_tokens = draft_visited_tokens;
	pair = target_forward_next();

	// verfiy
	accepted = verify(guess_tokens, udist);
	for (int i = 0; i < accepted; i++) {
	result_tokens.emplace_back(guess_tokens[i]);
	}

	// resample
	last_token = resample(pair.first, pair.second, accepted);
	result_tokens.emplace_back(last_token);
	}

	return result_tokens;
	}

	// Python module "chat_speculative": exposes class Qwen with its lifecycle
	// methods, generate(), and the read/write generation settings.
	PYBIND11_MODULE(chat_speculative, m) {
	  namespace py = pybind11;

	  py::class_<Qwen> qwen(m, "Qwen");

	  // construction and lifecycle
	  qwen.def(py::init<>());
	  qwen.def("init", &Qwen::init);
	  // qwen.def("forward_first", &Qwen::forward_first);
	  // qwen.def("forward_next", &Qwen::forward_next);
	  qwen.def("generate", &Qwen::generate);
	  qwen.def("deinit", &Qwen::deinit);

	  // attributes
	  qwen.def_readwrite("SEQLEN", &Qwen::SEQLEN); // read SEQLEN in pipeline.py
	  // qwen.def_readwrite("token_length", &Qwen::token_length);
	  qwen.def_readwrite("temperature", &Qwen::temperature);
	  qwen.def_readwrite("top_p", &Qwen::top_p);
	  qwen.def_readwrite("repeat_penalty", &Qwen::repeat_penalty);
	  qwen.def_readwrite("repeat_last_n", &Qwen::repeat_last_n);
	  qwen.def_readwrite("max_new_tokens", &Qwen::max_new_tokens);
	  qwen.def_readwrite("generation_mode", &Qwen::generation_mode);
	  qwen.def_readwrite("prompt_mode", &Qwen::prompt_mode);
	}