helenai committed
Commit 7927870
Parent(s): 3300f28

Modify for OpenVINO models

Files changed (3):
  1. README.md (+1 -2)
  2. app.py (+14 -17)
  3. requirements.txt (+1 -1)
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Chatbot Transformers Streaming
+title: OpenVINO/Transformers Chatbot
 emoji: 👀
 colorFrom: gray
 colorTo: blue
@@ -10,4 +10,3 @@ pinned: false
 duplicated_from: joaogante/transformers_streaming
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,21 +2,19 @@ from threading import Thread
 
 import torch
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
+from transformers import AutoTokenizer, TextIteratorStreamer
+from optimum.intel.openvino import OVModelForSeq2SeqLM
 
-model_id = "declare-lab/flan-alpaca-xl"
-torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-print("Running on device:", torch_device)
-print("CPU threads:", torch.get_num_threads())
+original_model_id = "declare-lab/flan-alpaca-xl"
+original_model_id = "declare-lab/flan-alpaca-large"
+model_id = f"helenai/{original_model_id.replace('/','-')}-ov"
 
-
-model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")
+model = OVModelForSeq2SeqLM.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-
 def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
     # Get the model and tokenizer, and tokenize the user text.
-    model_inputs = tokenizer([user_text], return_tensors="pt").to(torch_device)
+    model_inputs = tokenizer([user_text], return_tensors="pt")
 
     # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
     # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
@@ -46,16 +44,15 @@ def reset_textbox():
 
 
 with gr.Blocks() as demo:
-    duplicate_link = "https://huggingface.co/spaces/joaogante/transformers_streaming?duplicate=true"
+    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
     gr.Markdown(
-        "# 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
+        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
         "This demo showcases the use of the "
         "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
-        "of 🤗 Transformers with Gradio to generate text in real-time. It uses "
-        f"[{model_id}](https://huggingface.co/{model_id}), "
-        "loaded in 8-bit quantized form.\n\n"
-        f"Feel free to [duplicate this Space]({duplicate_link}) to try your own models or use this space as a "
-        "template! 💛"
+        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real-time. It uses "
+        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
+        "converted to OpenVINO.\n\n"
+        f"This space was duplicated from {original_link} and modified for OpenVINO models."
     )
 
     with gr.Row():
@@ -84,4 +81,4 @@ with gr.Blocks() as demo:
     user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
     button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
 
-demo.queue(max_size=32).launch(enable_queue=True)
+demo.queue(max_size=32).launch(enable_queue=True, server_name="0.0.0.0")
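The substance of this diff: AutoModelForSeq2SeqLM with load_in_8bit and a CUDA device is replaced by OVModelForSeq2SeqLM, which runs the converted model through the OpenVINO runtime (CPU by default), so the .to(torch_device) call on the tokenized inputs is dropped as well. Below is a minimal standalone sketch of the resulting load-and-stream pattern, assuming optimum-intel[openvino] is installed and that the helenai/declare-lab-flan-alpaca-large-ov repository exists as the diff constructs it:

from threading import Thread

from optimum.intel.openvino import OVModelForSeq2SeqLM
from transformers import AutoTokenizer, TextIteratorStreamer

model_id = "helenai/declare-lab-flan-alpaca-large-ov"  # as derived in app.py
model = OVModelForSeq2SeqLM.from_pretrained(model_id)  # OpenVINO runtime handles device placement
tokenizer = AutoTokenizer.from_pretrained(model_id)

# No .to(device): inputs stay on CPU for the OpenVINO runtime.
model_inputs = tokenizer(["What is OpenVINO?"], return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_special_tokens=True)

# generate() blocks, so run it on a worker thread and read tokens in the main
# thread, mirroring what run_generation() does for the Gradio UI.
thread = Thread(target=model.generate, kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=64))
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()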
requirements.txt CHANGED
@@ -1,4 +1,4 @@
 accelerate
-bitsandbytes
 torch
 git+https://github.com/huggingface/transformers.git # transformers from main (TextIteratorStreamer will be added in v4.28)
+optimum-intel[openvino]
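The dependency change mirrors the code change: bitsandbytes was only needed for the 8-bit CUDA load path that was removed, while optimum-intel[openvino] provides OVModelForSeq2SeqLM and pulls in the OpenVINO runtime it depends on. To reproduce the environment locally, something like pip install "optimum-intel[openvino]" should install the new inference stack.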