"""Thin batch-inference helpers around a vLLM engine for a chat model.

Importing this module loads the tokenizer and the vLLM engine for
``model_path``; the functions below reuse those module-level objects.
"""

import os

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

# Model identifier handed to both the tokenizer and the vLLM engine.
model_path = "happzy2633/qwen2.5-7b-ins-v3"

# Tokenizer is used only to render chat messages into prompt strings.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Decoding settings shared by every generation request in this module.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.8,
    repetition_penalty=1.05,
    max_tokens=8192,
)

# Engine is created once at import time and reused by all calls.
llm = LLM(model=model_path)


def api_call_batch(batch_messages):
    """Generate one completion per conversation in ``batch_messages``.

    Args:
        batch_messages: Iterable of conversations. Each conversation is a
            list of ``{"role": ..., "content": ...}`` dicts as accepted by
            ``tokenizer.apply_chat_template``.

    Returns:
        A list with the text of the first candidate completion for each
        conversation, in the order produced by ``llm.generate``. An empty
        input yields an empty list without invoking the engine.
    """
    # Render each conversation to a plain prompt string. ``tokenize=False``
    # returns text, so no ``return_tensors`` argument is passed.
    prompts = [
        tokenizer.apply_chat_template(
            conversation=messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        for messages in batch_messages
    ]
    if not prompts:
        # Nothing to generate; skip the engine call entirely.
        return []

    outputs = llm.generate(prompts, sampling_params)
    # Keep only the first candidate of each request.
    return [output.outputs[0].text for output in outputs]


def api_call(messages):
    """Generate a completion for a single conversation.

    Args:
        messages: One conversation (list of role/content dicts).

    Returns:
        The generated text for that conversation.
    """
    return api_call_batch([messages])[0]


def call_gpt(history, prompt):
    """Append ``prompt`` as a user turn to ``history`` and generate a reply.

    Args:
        history: List of prior role/content message dicts. It is not
            mutated; a new list is built for the request.
        prompt: Content of the new user message.

    Returns:
        The generated text for the extended conversation.
    """
    return api_call(history + [{"role": "user", "content": prompt}])


if __name__ == "__main__":
    # Smoke test: send the same single-turn conversation four times.
    messages = [{"role": "user", "content": "你是谁?"}]
    print(api_call_batch([messages] * 4))