Spaces:

helenai
/

openvino_transformers_streaming

Sleeping

App Files Files Community

openvino_transformers_streaming / app.py

helenai

Switch to Mistral model

72324f9 8 months ago

raw

history blame contribute delete

4.16 kB

	import pprint
	import subprocess
	from threading import Thread

	import gradio as gr
	from optimum.intel.openvino import OVModelForCausalLM
	from transformers import AutoTokenizer, TextIteratorStreamer

	# Print the host CPU details at startup (diagnostic only). ``lscpu`` is a Linux
	# utility, so skip the report instead of crashing the app where it is missing.
	try:
	    result = subprocess.run(["lscpu"], text=True, capture_output=True)
	except OSError as exc:
	    print(f"Skipping CPU report: could not run lscpu ({exc})")
	else:
	    pprint.pprint(result.stdout)

	# Upstream checkpoint; only used to build the link shown in the page text.
	original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
	# Hub repository that is actually loaded (model weights and tokenizer).
	model_id = "helenai/mistralai-Mistral-7B-Instruct-v0.2-ov"

	# Loaded once at import time; run_generation reads these module globals.
	model = OVModelForCausalLM.from_pretrained(model_id)
	tokenizer = AutoTokenizer.from_pretrained(model_id)


	def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
	    """Stream a sampled model reply to ``user_text`` as it is generated.

	    The prompt is sent as a single-turn chat (one ``user`` message) rendered
	    with the tokenizer's chat template. ``model.generate`` runs on a
	    background thread while this generator pulls text from the streamer.

	    Args:
	        user_text: The user's prompt.
	        top_p: Nucleus-sampling value forwarded to ``generate``.
	        temperature: Sampling temperature; coerced to ``float``.
	        top_k: Top-k sampling value; coerced to ``int``.
	        max_new_tokens: Cap on generated tokens; coerced to ``int``.

	    Yields:
	        The reply text accumulated so far, once per chunk from the streamer.

	    Returns:
	        The complete reply (as the generator's return value).
	    """
	    message = [{"role": "user", "content": user_text}]

	    # add_generation_prompt asks the template to open the assistant turn (for
	    # templates that define such a marker), so the model writes a reply rather
	    # than continuing the user's message.
	    model_inputs = tokenizer.apply_chat_template(
	        message,
	        add_generation_prompt=True,
	        return_tensors="pt",
	        return_dict=True,
	    )

	    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
	    # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
	    streamer = TextIteratorStreamer(
	        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
	    )
	    generate_kwargs = dict(
	        model_inputs,
	        streamer=streamer,
	        # UI sliders may deliver numbers as floats; these two must be integers.
	        max_new_tokens=int(max_new_tokens),
	        do_sample=True,
	        top_p=top_p,
	        temperature=float(temperature),
	        top_k=int(top_k),
	    )
	    t = Thread(target=model.generate, kwargs=generate_kwargs)
	    t.start()

	    # Pull the generated text from the streamer, and update the model output.
	    model_output = ""
	    for new_text in streamer:
	        model_output += new_text
	        yield model_output
	    return model_output


	def reset_textbox():
	    """Build the Gradio update that sets a component's value to an empty string."""
	    cleared_value = ""
	    return gr.update(value=cleared_value)


	# Page layout: intro text, then a row with the prompt/output column on the
	# left and the sampling-parameter sliders on the right.
	with gr.Blocks() as demo:
	    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
	    gr.Markdown(
	        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
	        "This demo showcases the use of the "
	        "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
	        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real-time. It uses "
	        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
	        "converted to OpenVINO.\n\n"
	        f"This space was duplicated from {original_link} and modified for OpenVINO models."
	    )

	    with gr.Row():
	        with gr.Column(scale=4):
	            user_text = gr.Textbox(
	                label="User input",
	            )
	            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
	            button_submit = gr.Button(value="Submit")

	        with gr.Column(scale=1):
	            max_new_tokens = gr.Slider(
	                minimum=1,
	                maximum=1000,
	                value=250,
	                step=1,
	                interactive=True,
	                label="Max New Tokens",
	            )
	            top_p = gr.Slider(
	                minimum=0.05,
	                maximum=1.0,
	                value=0.95,
	                step=0.05,
	                interactive=True,
	                label="Top-p (nucleus sampling)",
	            )
	            top_k = gr.Slider(
	                minimum=1,
	                maximum=50,
	                value=50,
	                step=1,
	                interactive=True,
	                label="Top-k",
	            )
	            temperature = gr.Slider(
	                minimum=0.1,
	                maximum=5.0,
	                value=0.8,
	                step=0.1,
	                interactive=True,
	                label="Temperature",
	            )

	    # Submitting the textbox and clicking the button run the same handler.
	    # The input order must match run_generation's parameter order.
	    user_text.submit(
	        run_generation,
	        [user_text, top_p, temperature, top_k, max_new_tokens],
	        model_output,
	    )
	    button_submit.click(
	        run_generation,
	        [user_text, top_p, temperature, top_k, max_new_tokens],
	        model_output,
	    )

	# queue() is what enables request queuing; the former launch(enable_queue=True)
	# argument was redundant with it and is not accepted by newer Gradio releases,
	# so it is no longer passed.
	demo.queue(max_size=32).launch(server_name="0.0.0.0")
	# For local use:
	# demo.launch(server_name="0.0.0.0")