import gradio as gr
from huggingface_hub import InferenceClient


SYSTEM_MESSAGE_DEFAULT = "You are a friendly Chatbot."
MAX_TOKENS_DEFAULT = 512
TEMPERATURE_DEFAULT = 0.7
TOP_P_DEFAULT = 0.95

inference_client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
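# Note: gated or rate-limited models may require authentication. A minimal
# sketch, assuming a token is exposed via an HF_TOKEN environment variable
# (an assumption; adapt to however credentials are provided):
#
#     import os
#     inference_client = InferenceClient(
#         "HuggingFaceH4/zephyr-7b-beta", token=os.environ["HF_TOKEN"]
#     )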


def respond(
    user_message: str,
    conversation_history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """
    Respond to a user message given the conversation history and other parameters.

    Args:
        user_message (str): The user's message.
        conversation_history (list[tuple[str, str]]): The conversation history.
        system_message (str): The system message to display at the top of the chat interface.
        max_tokens (int): The maximum number of tokens to generate in the response.
        temperature (float): The temperature to use when generating text.
        top_p (float): The top-p value to use when generating text.

    Yields:
        list[tuple[str, str]]: Updated conversation history with the new assistant response.
    """
    messages = [{"role": "system", "content": system_message}]
    # Prepare messages for the model based on the history
    for user_input, assistant_response in conversation_history:
        if user_input:
            messages.append({"role": "user", "content": user_input})
        if assistant_response:
            messages.append({"role": "assistant", "content": assistant_response})

    # Append the new user message
    messages.append({"role": "user", "content": user_message})

    # Accumulate the streamed response
    response = ""

    # Stream the completion from the inference client
    for chunk in inference_client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # Some stream chunks (e.g. the final one) may carry no content
        if token:
            response += token
        # Yield the partial response so gr.ChatInterface can stream it
        yield response


# Chatbot interface definition
chatbot_interface = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(height=600),
    additional_inputs=[
        gr.Textbox(
            value=SYSTEM_MESSAGE_DEFAULT,
            label="System message",
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=MAX_TOKENS_DEFAULT,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=TEMPERATURE_DEFAULT,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=TOP_P_DEFAULT,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    chatbot_interface.launch()
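
# A quick way to exercise respond() without launching the UI (a sketch only;
# it performs a real inference call, so it needs network access):
#
#     for partial in respond(
#         "Hello!", [], SYSTEM_MESSAGE_DEFAULT,
#         MAX_TOKENS_DEFAULT, TEMPERATURE_DEFAULT, TOP_P_DEFAULT,
#     ):
#         pass
#     print(partial)  # the final, fully streamed reply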