import asyncio
import collections
from typing import Any, Dict, List, Optional, OrderedDict, Tuple
import gradio_client # type: ignore
from h2ogpt_client import enums


class Client:
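    """Connects to an h2oGPT server.

    A minimal usage sketch (the URL is illustrative; point it at any running
    h2oGPT Gradio server)::

        client = Client("http://localhost:7860")
        answer = client.text_completion.create(prompt="Why is the sky blue?")
    """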
def __init__(self, server_url: str, huggingface_token: Optional[str] = None):
self._client = gradio_client.Client(
src=server_url, hf_token=huggingface_token, serialize=False, verbose=False
)
self._text_completion = TextCompletion(self)
self._chat_completion = ChatCompletion(self)

    @property
def text_completion(self) -> "TextCompletion":
return self._text_completion

    @property
def chat_completion(self) -> "ChatCompletion":
return self._chat_completion

    def _predict(self, *args, api_name: str) -> Any:
return self._client.submit(*args, api_name=api_name).result()

    async def _predict_async(self, *args, api_name: str) -> Any:
return await asyncio.wrap_future(self._client.submit(*args, api_name=api_name))


class TextCompletion:
"""Text completion"""
def __init__(self, client: Client):
self._client = client

    def create(
self,
prompt: str,
prompt_type: enums.PromptType = enums.PromptType.plain,
input_context_for_instruction: str = "",
        enable_sampler: bool = False,
temperature: float = 1.0,
top_p: float = 1.0,
top_k: int = 40,
beams: float = 1.0,
early_stopping: bool = False,
min_output_length: int = 0,
max_output_length: int = 128,
max_time: int = 180,
repetition_penalty: float = 1.07,
number_returns: int = 1,
system_pre_context: str = "",
langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
) -> str:
"""
Creates a new text completion.
:param prompt: text prompt to generate completions for
:param prompt_type: type of the prompt
:param input_context_for_instruction: input context for instruction
:param enable_sampler: enable or disable the sampler, required for use of
temperature, top_p, top_k
        :param temperature: sampling temperature to use, between 0 and 3.
            Lower values make the output more focused and deterministic, but
            may lead to repetition. Higher values make the output more
            creative, but may lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: number of beams to search for the best overall
            probability. Higher values use more GPU memory and compute.
        :param early_stopping: whether to stop early in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to spend searching for an optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns: number of completions to return
        :param system_pre_context: prepended directly, without prompt
            processing
:param langchain_mode: LangChain mode
:return: response from the model
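
        Example (illustrative; assumes an h2oGPT server is reachable at the
        URL below)::

            client = Client("http://localhost:7860")
            text = client.text_completion.create(
                prompt="Q: Why is the sky blue? A:",
                max_output_length=64,
            )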
"""
# Not exposed parameters.
instruction = "" # empty when chat_mode is False
input = "" # only chat_mode is True
stream_output = False
prompt_dict = "" # empty as prompt_type cannot be 'custom'
chat_mode = False
langchain_top_k_docs = 4 # number of document chunks; not public
langchain_enable_chunk = True # whether to chunk documents; not public
langchain_chunk_size = 512 # chunk size for document chunking; not public
        langchain_document_choice = ["All"]  # not public
return self._client._predict(
instruction,
input,
system_pre_context,
stream_output,
prompt_type.value,
prompt_dict,
temperature,
top_p,
top_k,
beams,
max_output_length,
min_output_length,
early_stopping,
max_time,
repetition_penalty,
number_returns,
enable_sampler,
chat_mode,
prompt,
input_context_for_instruction,
langchain_mode.value,
langchain_top_k_docs,
langchain_enable_chunk,
langchain_chunk_size,
langchain_document_choice,
api_name="/submit_nochat",
)

    async def create_async(
self,
prompt: str,
prompt_type: enums.PromptType = enums.PromptType.plain,
input_context_for_instruction: str = "",
        enable_sampler: bool = False,
temperature: float = 1.0,
top_p: float = 1.0,
top_k: int = 40,
beams: float = 1.0,
early_stopping: bool = False,
min_output_length: int = 0,
max_output_length: int = 128,
max_time: int = 180,
repetition_penalty: float = 1.07,
number_returns: int = 1,
system_pre_context: str = "",
langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
) -> str:
"""
Creates a new text completion asynchronously.
:param prompt: text prompt to generate completions for
:param prompt_type: type of the prompt
:param input_context_for_instruction: input context for instruction
:param enable_sampler: enable or disable the sampler, required for use of
temperature, top_p, top_k
        :param temperature: sampling temperature to use, between 0 and 3.
            Lower values make the output more focused and deterministic, but
            may lead to repetition. Higher values make the output more
            creative, but may lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: number of beams to search for the best overall
            probability. Higher values use more GPU memory and compute.
        :param early_stopping: whether to stop early in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to spend searching for an optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns: number of completions to return
        :param system_pre_context: prepended directly, without prompt
            processing
:param langchain_mode: LangChain mode
:return: response from the model
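
        Example (illustrative; assumes an h2oGPT server is reachable at the
        URL below)::

            async def main() -> None:
                client = Client("http://localhost:7860")
                text = await client.text_completion.create_async(
                    prompt="Q: Why is the sky blue? A:"
                )

            asyncio.run(main())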
"""
# Not exposed parameters.
instruction = "" # empty when chat_mode is False
input = "" # only chat_mode is True
stream_output = False
prompt_dict = "" # empty as prompt_type cannot be 'custom'
chat_mode = False
langchain_top_k_docs = 4 # number of document chunks; not public
langchain_enable_chunk = True # whether to chunk documents; not public
langchain_chunk_size = 512 # chunk size for document chunking; not public
langchain_document_choice = ["All"] # not public
return await self._client._predict_async(
instruction,
input,
system_pre_context,
stream_output,
prompt_type.value,
prompt_dict,
temperature,
top_p,
top_k,
beams,
max_output_length,
min_output_length,
early_stopping,
max_time,
repetition_penalty,
number_returns,
enable_sampler,
chat_mode,
prompt,
input_context_for_instruction,
langchain_mode.value,
langchain_top_k_docs,
langchain_enable_chunk,
langchain_chunk_size,
langchain_document_choice,
api_name="/submit_nochat",
)


class ChatCompletion:
"""Chat completion"""
def __init__(self, client: Client):
self._client = client

    def create(
self,
prompt_type: enums.PromptType = enums.PromptType.plain,
input_context_for_instruction: str = "",
        enable_sampler: bool = False,
temperature: float = 1.0,
top_p: float = 1.0,
top_k: int = 40,
beams: float = 1.0,
early_stopping: bool = False,
min_output_length: int = 0,
max_output_length: int = 128,
max_time: int = 180,
repetition_penalty: float = 1.07,
number_returns: int = 1,
system_pre_context: str = "",
langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
) -> "ChatContext":
"""
        Creates a new chat context with the given parameters.
:param prompt_type: type of the prompt
:param input_context_for_instruction: input context for instruction
:param enable_sampler: enable or disable the sampler, required for use of
temperature, top_p, top_k
        :param temperature: sampling temperature to use, between 0 and 3.
            Lower values make the output more focused and deterministic, but
            may lead to repetition. Higher values make the output more
            creative, but may lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: number of beams to search for the best overall
            probability. Higher values use more GPU memory and compute.
        :param early_stopping: whether to stop early in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to spend searching for an optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns: number of completions to return
        :param system_pre_context: prepended directly, without prompt
            processing
:param langchain_mode: LangChain mode
:return: a chat context with given parameters
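
        Example (illustrative; assumes an h2oGPT server is reachable at the
        URL below)::

            client = Client("http://localhost:7860")
            context = client.chat_completion.create(max_output_length=64)
            reply = context.chat("What is the capital of France?")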
"""
kwargs = collections.OrderedDict(
            instruction=None,  # filled in per turn by ChatContext.chat
            input="",  # left empty; not exposed by this client
system_pre_context=system_pre_context,
stream_output=False,
prompt_type=prompt_type.value,
prompt_dict="", # empty as prompt_type cannot be 'custom'
temperature=temperature,
top_p=top_p,
top_k=top_k,
beams=beams,
max_output_length=max_output_length,
min_output_length=min_output_length,
early_stopping=early_stopping,
max_time=max_time,
repetition_penalty=repetition_penalty,
number_returns=number_returns,
enable_sampler=enable_sampler,
chat_mode=True,
instruction_nochat="", # empty when chat_mode is True
input_context_for_instruction=input_context_for_instruction,
langchain_mode=langchain_mode.value,
langchain_top_k_docs=4, # number of document chunks; not public
langchain_enable_chunk=True, # whether to chunk documents; not public
langchain_chunk_size=512, # chunk size for document chunking; not public
langchain_document_choice=["All"], # not public
chatbot=[], # chat history
)
return ChatContext(self._client, kwargs)


class ChatContext:
    """Chat context"""
def __init__(self, client: Client, kwargs: OrderedDict[str, Any]):
self._client = client
self._kwargs = kwargs

    def chat(self, prompt: str) -> Dict[str, str]:
"""
Chat with the GPT.
:param prompt: text prompt to generate completions for
        :return: chat reply
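
        The reply has the form (illustrative)::

            {"user": "What is the capital of France?", "gpt": "Paris."}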
"""
self._kwargs["instruction"] = prompt
self._kwargs["chatbot"] += [[prompt, None]]
response: Tuple[List[List[str]], str] = self._client._predict(
*self._kwargs.values(), api_name="/instruction_bot"
)
self._kwargs["chatbot"][-1][1] = response[0][-1][1]
return {"user": response[0][-1][0], "gpt": response[0][-1][1]}

    def chat_history(self) -> List[Dict[str, str]]:
"""Returns the full chat history."""
return [{"user": i[0], "gpt": i[1]} for i in self._kwargs["chatbot"]]