---
license: mit
---
|
How to use with vLLM:
|
```python
from vllm import LLM, SamplingParams

inputs = [
    "Who is the president of US?",
    "Can you speak Indonesian?",
]

# Initialize the LLM with the AWQ-quantized model
llm = LLM(
    model="jester6136/Phi-3.5-mini-instruct-awq",
    quantization="awq",
    gpu_memory_utilization=0.9,
    max_model_len=2000,
    max_num_seqs=32,
)

sparams = SamplingParams(
    temperature=0.0,
    max_tokens=2000,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.05,
)

# Wrap each prompt in the Phi-3.5 chat format
chat_template = "<|user|>\n{input} <|end|>\n<|assistant|>"
prompts = [chat_template.format(input=prompt) for prompt in inputs]

outputs = llm.generate(prompts, sparams)

# Print out the model responses
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt}\nResponse: {generated_text}\n\n")
```
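
If you prefer not to hard-code the prompt format, you can build prompts from the tokenizer's own chat template via transformers. A minimal sketch, assuming transformers is installed and this repo ships a chat template in its tokenizer config:

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "jester6136/Phi-3.5-mini-instruct-awq"

# Build prompts with the tokenizer's chat template instead of a
# hand-written format string (assumes the repo includes a chat template)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = ["Who is the president of US?", "Can you speak Indonesian?"]
prompts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": text}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for text in inputs
]

llm = LLM(model=model_id, quantization="awq", max_model_len=2000)
sparams = SamplingParams(temperature=0.0, max_tokens=2000, top_p=0.95)

for output in llm.generate(prompts, sparams):
    print(f"Prompt: {output.prompt}\nResponse: {output.outputs[0].text}\n")
```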
|