Qwen1_5/speculative_sample_demo/README.md · JoshuaChak/bmodel-qwen1.5-1.8b at 3f12f51ff3b55aecb39e4d9ece4d384896b48da3

cp ../compile/files/Qwen1.5-1.8B-Chat/modeling_qwen2.py /usr/local/lib/python3.10/dist-packages/transformers/models/qwen2/

python export_onnx.py -m ../compile/Qwen1.5-1.8B-Chat/ -d cpu

python export_pt.py -m ../compile/Qwen1.5-7B-Chat/ -d cpu
./compile_pt.sh --mode int4 --name qwen1.5-7b --addr_mode io_alone --guess_len 5 --seq_length 512

mkdir build
cd build/ && cmake .. && make -j  && cp *cpython* .. && cd ..

python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/qwen1.5-1.8b_int4_1dev.bmodel
python3 -m dfss --url=open@sophgo.com:/ext_model_information/LLM/LLM-TPU/qwen1.5-1.8b_int4_1dev.bmodel

draft_model = qwen1.5_0.5b_int4
target_model = qwen1.5_7b_int4
测试完成481个token用时40.94239926338196 s，提升20%

完成26个token用时2.04833984375 s，提升30%