helenai committed
Commit 7927870
Parent(s): 3300f28

Modify for OpenVINO models

Files changed (3):
  1. README.md (+1 -2)
  2. app.py (+14 -17)
  3. requirements.txt (+1 -1)
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Chatbot Transformers Streaming
+title: OpenVINO/Transformers Chatbot
 emoji: 👀
 colorFrom: gray
 colorTo: blue
@@ -10,4 +10,3 @@ pinned: false
 duplicated_from: joaogante/transformers_streaming
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,21 +2,19 @@ from threading import Thread
 
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
+from transformers import AutoTokenizer, TextIteratorStreamer
+from optimum.intel.openvino import OVModelForSeq2SeqLM
 
-model_id = "declare-lab/flan-alpaca-xl"
-torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-print("Running on device:", torch_device)
-print("CPU threads:", torch.get_num_threads())
+original_model_id = "declare-lab/flan-alpaca-xl"
+original_model_id = "declare-lab/flan-alpaca-large"
+model_id = f"helenai/{original_model_id.replace('/','-')}-ov"
 
-
-model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
+model = OVModelForSeq2SeqLM.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     # Get the model and tokenizer, and tokenize the user text.
-    model_inputs = tokenizer([user_text], return_tensors="pt").to(torch_device)
+    model_inputs = tokenizer([user_text], return_tensors="pt")
 
     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
@@ -46,16 +44,15 @@ def reset_textbox():
 
 
 with gr.Blocks() as demo:
-    duplicate_link = "https://huggingface.co/spaces/joaogante/transformers_streaming?duplicate=true"
+    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
     gr.Markdown(
-        "# 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
+        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
         "This demo showcases the use of the "
         "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
-        "of 🤗 Transformers with Gradio to generate text in real-time. It uses "
-        f"[{model_id}](https://huggingface.co/{model_id}), "
-        "loaded in 8-bit quantized form.\n\n"
-        f"Feel free to [duplicate this Space]({duplicate_link}) to try your own models or use this space as a "
-        "template! 💛"
+        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real-time. It uses "
+        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
+        "converted to OpenVINO.\n\n"
+        f"This space was duplicated from {original_link} and modified for OpenVINO models."
     )
 
     with gr.Row():
@@ -84,4 +81,4 @@ with gr.Blocks() as demo:
     user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
     button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
 
-demo.queue(max_size=32).launch(enable_queue=True)
+demo.queue(max_size=32).launch(enable_queue=True, server_name="0.0.0.0")
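The substance of this diff: AutoModelForSeq2SeqLM with load_in_8bit and a CUDA device is replaced by OVModelForSeq2SeqLM, which runs the converted model through the OpenVINO runtime (CPU by default), so the .to(torch_device) call on the tokenized inputs is dropped as well. Below is a minimal standalone sketch of the resulting load-and-stream pattern, assuming optimum-intel[openvino] is installed and that the helenai/declare-lab-flan-alpaca-large-ov repository exists as the diff constructs it:

from threading import Thread

from optimum.intel.openvino import OVModelForSeq2SeqLM
from transformers import AutoTokenizer, TextIteratorStreamer

model_id = "helenai/declare-lab-flan-alpaca-large-ov"  # as derived in app.py
model = OVModelForSeq2SeqLM.from_pretrained(model_id)  # OpenVINO runtime handles device placement
tokenizer = AutoTokenizer.from_pretrained(model_id)

# No .to(device): inputs stay on CPU for the OpenVINO runtime.
model_inputs = tokenizer(["What is OpenVINO?"], return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_special_tokens=True)

# generate() blocks, so run it on a worker thread and read tokens in the main
# thread, mirroring what run_generation() does for the Gradio UI.
thread = Thread(target=model.generate, kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=64))
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()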
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 accelerate
-bitsandbytes
 torch
 git+https://github.com/huggingface/transformers.git # transformers from main (TextIteratorStreamer will be added in v4.28)
+optimum-intel[openvino]
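The dependency change mirrors the code change: bitsandbytes was only needed for the 8-bit CUDA load path that was removed, while optimum-intel[openvino] provides OVModelForSeq2SeqLM and pulls in the OpenVINO runtime it depends on. To reproduce the environment locally, something like pip install "optimum-intel[openvino]" should install the new inference stack.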