shamik committed on
Commit
b13f01f
•
1 Parent(s): 28a9b4f

Changed app.py.

Files changed (3)
  1. .gitignore +1 -0
  2. app.py +70 -58
  3. creds.env +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
+.ipynb_checkpoints/
app.py CHANGED
@@ -1,76 +1,88 @@
-import gradio as gr
 from huggingface_hub import InferenceClient
 import os

 HF_TOKEN = os.environ.get("HF_TOKEN", None)

-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    fn=respond,
-    chatbot=gr.Chatbot(show_label=False, show_share_button=False,
-                       show_copy_button=True, likeable=True, layout="panel"),
-    title="""Have a chat with LLama3 8B""",
-    additional_inputs=[
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens",interactive=True,
-                  info="The maximum numbers of new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.1, step=0.1, label="Temperature",interactive=True,
-                  info="Higher values produce more diverse outputs"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
             interactive=True,
-            info="Higher values sample more low-probability tokens"),
-        gr.Textbox(value="You are a truthful and friendly Assistant.", label="System message"),
-    ],
     examples=[
         ["Can you explain briefly to me what is the Python programming language?"],
         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
     ]
-)
-
-
-if __name__ == "__main__":
-    demo.launch(show_api=False)
 
 
 from huggingface_hub import InferenceClient
+import gradio as gr  # still needed for gr.ChatInterface and gr.Slider below
 import os

 HF_TOKEN = os.environ.get("HF_TOKEN", None)

+client = InferenceClient(
+    "meta-llama/Meta-Llama-3-8B-Instruct"
+)

+punctuation_marks = [".", "!", "?"]
+def generate(
+    prompt, history, temperature=0.2, max_new_tokens=256, top_p=0.8, repetition_penalty=1.0,
 ):
+    temperature = float(temperature)
+    if temperature < 1e-2:
+        temperature = 1e-2
+    top_p = float(top_p)

+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        do_sample=True,
+        seed=42,
+    )

+    stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+    output = ""

+    for response in stream:
+        output += response.token.text
+    # Trim any trailing partial sentence so the reply ends on a punctuation mark.
+    while output and output[-1] not in punctuation_marks:
+        output = output[:-1]
+    # yield output
+    return output


+additional_inputs = [
+    gr.Slider(
+        label="Temperature",
+        value=0.2,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values produce more diverse outputs",
+    ),
+    gr.Slider(
+        label="Max new tokens",
+        value=256,
+        minimum=0,
+        maximum=1048,
+        step=64,
         interactive=True,
+        info="The maximum number of new tokens",
+    ),
+    gr.Slider(
+        label="Top-p (nucleus sampling)",
+        value=0.80,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values sample more low-probability tokens",
+    ),
+    gr.Slider(
+        label="Repetition penalty",
+        value=1.0,
+        minimum=1.0,
+        maximum=2.0,
+        step=0.05,
+        interactive=True,
+        info="Penalize repeated tokens",
+    ),
+]


+gr.ChatInterface(
+    fn=generate,
+    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
+    additional_inputs=additional_inputs,
     examples=[
         ["Can you explain briefly to me what is the Python programming language?"],
         ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
     ]
+).launch(show_api=False)
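One loose end in the new version: HF_TOKEN is read from the environment but never handed to the client, even though meta-llama/Meta-Llama-3-8B-Instruct is a gated model. A minimal sketch of wiring it through; token= is an accepted InferenceClient argument, and the model id mirrors the committed code:

    import os
    from huggingface_hub import InferenceClient

    HF_TOKEN = os.environ.get("HF_TOKEN", None)

    # Hypothetical hardening: pass the token explicitly so requests to the
    # gated model authenticate even outside a Space that injects credentials.
    client = InferenceClient(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        token=HF_TOKEN,
    )

Recent huggingface_hub releases usually fall back to the HF_TOKEN environment variable on their own, so the explicit argument mainly makes the dependency visible.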
 
 
 
 
creds.env ADDED
File without changes