import os import gradio as gr from text_generation import Client # HF-hosted endpoint for testing purposes (requires an HF API token) API_TOKEN = os.environ.get("API_TOKEN", None) CURRENT_CLIENT = Client("https://afrts4trc759c6eq.us-east-1.aws.endpoints.huggingface.cloud/generate_stream", timeout=120, headers={ "Accept": "application/json", "Authorization": f"Bearer {API_TOKEN}", "Content-Type": "application/json"} ) DEFAULT_HEADER = os.environ.get("HEADER", "") DEFAULT_USER_NAME = os.environ.get("USER_NAME", "user") DEFAULT_ASSISTANT_NAME = os.environ.get("ASSISTANT_NAME", "assistant") DEFAULT_SEPARATOR = os.environ.get("SEPARATOR", "<|im_end|>") PROMPT_TEMPLATE = "<|im_start|>{user_name}\n{query}{separator}\n<|im_start|>{assistant_name}\n{response}" repo = None def get_total_inputs(inputs, chatbot, preprompt, user_name, assistant_name, sep): past = [] for data in chatbot: user_data, model_data = data if not user_data.startswith(user_name): user_data = user_name + user_data if not model_data.startswith(sep + assistant_name): model_data = sep + assistant_name + model_data past.append(user_data + model_data.rstrip() + sep) if not inputs.startswith(user_name): inputs = user_name + inputs total_inputs = preprompt + "".join(past) + inputs + sep + assistant_name.rstrip() return total_inputs def has_no_history(chatbot, history): return not chatbot and not history def generate( user_message, chatbot, history, temperature, top_p, max_new_tokens, repetition_penalty, header, user_name, assistant_name, separator ): # Don't return meaningless message when the input is empty if not user_message: print("Empty input") history.append(user_message) past_messages = [] for data in chatbot: user_data, model_data = data past_messages.extend( [{"role": "user", "content": user_data}, {"role": "assistant", "content": model_data.rstrip()}] ) print(past_messages) if len(past_messages) < 1: prompt = header + PROMPT_TEMPLATE.format(user_name=user_name, query=user_message, assistant_name=assistant_name, response="", separator=separator) else: prompt = header for i in range(0, len(past_messages), 2): intermediate_prompt = PROMPT_TEMPLATE.format(user_name=user_name, query=past_messages[i]["content"], assistant_name=assistant_name, response=past_messages[i + 1]["content"], separator=separator) # print(prompt, separator, intermediate_prompt) prompt = prompt + intermediate_prompt + separator + "\n" # print(prompt) prompt = prompt + PROMPT_TEMPLATE.format(user_name=user_name, query=user_message, assistant_name=assistant_name, response="", separator=separator) temperature = float(temperature) if temperature < 1e-2: temperature = 1e-2 top_p = float(top_p) generate_kwargs = dict( temperature=temperature, max_new_tokens=max_new_tokens, top_p=top_p, top_k=40, repetition_penalty=repetition_penalty, do_sample=True, truncate=1024, # seed=42, # stop_sequences=[user_name, DEFAULT_SEPARATOR] stop_sequences=[DEFAULT_SEPARATOR] ) # print(prompt) stream = CURRENT_CLIENT.generate_stream( prompt, **generate_kwargs, ) output = "" for idx, response in enumerate(stream): # print(response.token) if response.token.text == '': pass # print(response.token.text) # break if response.token.special: continue output += response.token.text if idx == 0: history.append(" " + output) else: history[-1] = output chat = [(history[i].strip(), history[i + 1].strip()) for i in range(0, len(history) - 1, 2)] # chat = [(history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)] yield chat, history, user_message, "" return chat, history, user_message, "" def clear_chat(): return [], [] title = """