antony-pk committed
Commit
d5471f1
1 Parent(s): 7e64fd9

Full application

Files changed (1)
  1. app.py +96 -4
app.py CHANGED
@@ -10,9 +10,101 @@ from threading import Thread
 import spaces
 import time
 
-token = os.environ["HF_TOKEN"]
-model_name = ""
+hf_token = os.environ["HF_TOKEN"]
+model_name = os.environ["MODEL_NAME"]
 
 model = AutoModelForCausalLM.from_pretrained(
-
-)
+    model_name,
+    token=hf_token
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+
+terminators = [
+    tokenizer.eos_token_id,
+]
+
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+else:
+    device = torch.device("cpu")
+    print("Using CPU")
+
+model = model.to(device)
+
+@spaces.GPU(duration=60)
+def chat(message, history, temperature, do_sample, max_tokens):
+    chat = []
+    for item in history:
+        chat.append({
+            "role": "user",
+            "content": item[0]
+        })
+        if item[1] is not None:
+            chat.append({
+                "role": "assistant",
+                "content": item[1]
+            })
+    chat.append({
+        "role": "user",
+        "content": message
+    })
+
+    messages = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    model_inputs = tokenizer([messages], return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=20,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        eos_token_id=terminators
+    )
+
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        yield partial_text
+
+    yield partial_text
+
+
+demo = gr.ChatInterface(
+    fn=chat,
+    examples=[["Write me a poem about machine learning"]],
+    additional_inputs_accordion=gr.Accordion(
+        label="⚙️ Parameters", open=False, render=False
+    ),
+    additional_inputs=[
+        gr.Slider(
+            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
+        ),
+        gr.Checkbox(label="Sampling", value=True),
+        gr.Slider(
+            minimum=128,
+            maximum=4096,
+            step=1,
+            value=512,
+            label="Max new tokens",
+            render=False
+        ),
+
+    ],
+    stop_btn="Stop Generation",
+    title="Chat with Phi3.5 ERPNext",
+    description="Now running antony - Phi3.5 ERPNext"
+)
+
+demo.launch()
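
Note: the hunk starts at app.py line 10, so the file's import header is not part of this diff. Judging from the identifiers the code relies on (os, gr, torch, spaces, Thread, AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer) and the `from threading import Thread` context in the hunk header, the lines above it presumably look something like the following sketch; it is an inference, not the committed code:

# Sketch of the import header above this hunk (app.py lines 1-9, not shown
# in the diff); inferred from the names used below -- an assumption, not
# necessarily the committed imports.
import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

One behavioral caveat in the added code: the "Sampling" checkbox arrives as the `do_sample` argument of chat() but is never forwarded to model.generate(), and transformers defaults to greedy decoding, so the Temperature slider has no effect as written. A hypothetical one-line fix, placed before the Thread is started, would be `generate_kwargs["do_sample"] = do_sample and temperature > 0`.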