antony-pk committed
Commit 5fe0adf
1 Parent(s): cbd3c65

Create app.py

Files changed (1)
app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
+ import gradio as gr
+ import torch
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
+ import os
+ from threading import Thread
+ import spaces
+
+ # Read the Hugging Face access token from the Space's secrets.
+ token = os.environ["HF_TOKEN"]
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "antony-pk/Phi-3.5-mini-instruct-e3-eval50-Sep18-v1",
+     token=token,
+     trust_remote_code=True,
+ )
+ tok = AutoTokenizer.from_pretrained(
+     "antony-pk/Phi-3.5-mini-instruct-e3-eval50-Sep18-v1", token=token
+ )
+ # Stop generation at the end-of-sequence token.
+ terminators = [
+     tok.eos_token_id,
+ ]
+
+ # Prefer the GPU when one is available.
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ else:
+     device = torch.device("cpu")
+     print("Using CPU")
+
+ # Moving the model to the device explicitly avoids dispatch errors.
+ model = model.to(device)
+
+
+ @spaces.GPU(duration=60)
+ def chat(message, history, temperature, do_sample, max_tokens):
+     # Rebuild the conversation in the format expected by the chat template.
+     conversation = []
+     for item in history:
+         conversation.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             conversation.append({"role": "assistant", "content": item[1]})
+     conversation.append({"role": "user", "content": message})
+     messages = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+     model_inputs = tok([messages], return_tensors="pt").to(device)
+     # Stream tokens back to the UI as they are produced.
+     streamer = TextIteratorStreamer(
+         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+     )
+     generate_kwargs = dict(
+         model_inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=do_sample,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+
+     # A temperature of 0 means greedy decoding.
+     if temperature == 0:
+         generate_kwargs["do_sample"] = False
+
+     # Run generation on a background thread so partial text can be yielded.
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     partial_text = ""
+     for new_text in streamer:
+         partial_text += new_text
+         yield partial_text
+
+     yield partial_text
+
+
+ demo = gr.ChatInterface(
+     fn=chat,
+     examples=[["Write me a poem about Machine Learning."]],
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Slider(
+             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+         ),
+         gr.Checkbox(label="Sampling", value=True, render=False),
+         gr.Slider(
+             minimum=128,
+             maximum=4096,
+             step=1,
+             value=512,
+             label="Max new tokens",
+             render=False,
+         ),
+     ],
+     stop_btn="Stop Generation",
+     title="Chat With antony-pk/Phi-3.5-mini-instruct-e3-eval50-Sep18-v1",
+     description="Now Running antony-pk/Phi-3.5-mini-instruct-e3-eval50-Sep18-v1",
+ )
+ demo.launch()
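
For reference, a Space like this also needs its Python dependencies declared. A minimal requirements.txt sketch, assuming the ZeroGPU spaces package and recent gradio/torch/transformers releases (the unpinned package list is an assumption, not part of this commit):

    gradio
    torch
    transformers
    accelerate
    spaces

Once the Space is running, it can also be queried programmatically. A minimal sketch using gradio_client, assuming the Space is public and that gr.ChatInterface exposes its default "/chat" endpoint with the additional inputs appended after the message; the Space id "antony-pk/phi-3.5-chat" is hypothetical:

    from gradio_client import Client

    # Hypothetical Space id; substitute the real repo id of this Space.
    client = Client("antony-pk/phi-3.5-chat")
    reply = client.predict(
        "Write me a poem about Machine Learning.",  # message
        0.9,    # temperature
        True,   # do_sample (the "Sampling" checkbox)
        512,    # max_tokens
        api_name="/chat",
    )
    print(reply)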