Spaces:

akashkj
/

H2OGPT

Runtime error

File size: 12,469 Bytes

3f7cfab

import asyncio
import collections
from typing import Any, Dict, List, Optional, OrderedDict, Tuple

import gradio_client  # type: ignore

from h2ogpt_client import enums


class Client:
    """Entry point wrapping a ``gradio_client.Client`` bound to one server.

    The text and chat completion helpers are created once in the constructor
    and exposed through read-only properties.
    """

    def __init__(self, server_url: str, huggingface_token: Optional[str] = None):
        """
        Creates the underlying gradio client and the completion helpers.

        :param server_url: forwarded to ``gradio_client.Client`` as ``src``
        :param huggingface_token: forwarded to ``gradio_client.Client`` as
                ``hf_token``
        """
        connection_options = {
            "src": server_url,
            "hf_token": huggingface_token,
            "serialize": False,
            "verbose": False,
        }
        self._client = gradio_client.Client(**connection_options)
        self._text_completion = TextCompletion(self)
        self._chat_completion = ChatCompletion(self)

    @property
    def text_completion(self) -> "TextCompletion":
        """Text completion helper bound to this client."""
        return self._text_completion

    @property
    def chat_completion(self) -> "ChatCompletion":
        """Chat completion helper bound to this client."""
        return self._chat_completion

    def _predict(self, *args, api_name: str) -> Any:
        """
        Submits a request to the given endpoint and returns the job's result.

        :param args: positional arguments forwarded to the endpoint
        :param api_name: name of the endpoint to call
        :return: whatever the submitted job's ``result()`` returns
        """
        job = self._client.submit(*args, api_name=api_name)
        return job.result()

    async def _predict_async(self, *args, api_name: str) -> str:
        """
        Submits a request to the given endpoint and awaits its outcome.

        :param args: positional arguments forwarded to the endpoint
        :param api_name: name of the endpoint to call
        :return: the outcome of the submitted job
        """
        job = self._client.submit(*args, api_name=api_name)
        # The submitted job is bridged into an awaitable for the running loop.
        return await asyncio.wrap_future(job)


class TextCompletion:
    """Text completion"""

    def __init__(self, client: Client):
        self._client = client

    @staticmethod
    def _request_args(
        prompt: str,
        prompt_type: enums.PromptType,
        input_context_for_instruction: str,
        enable_sampler: bool,
        temperature: float,
        top_p: float,
        top_k: int,
        beams: float,
        early_stopping: bool,
        min_output_length: int,
        max_output_length: int,
        max_time: int,
        repetition_penalty: float,
        number_returns: int,
        system_pre_context: str,
        langchain_mode: enums.LangChainMode,
    ) -> Tuple[Any, ...]:
        """
        Builds the positional arguments sent to the ``/submit_nochat`` endpoint.

        Shared by ``create`` and ``create_async`` so both always send exactly
        the same argument order. The order of the returned tuple is significant
        because the values are passed positionally.

        :return: tuple of endpoint arguments in the expected order
        """
        # Not exposed parameters.
        instruction = ""  # empty when chat_mode is False
        input_text = ""  # only chat_mode is True
        stream_output = False
        prompt_dict = ""  # empty as prompt_type cannot be 'custom'
        chat_mode = False
        langchain_top_k_docs = 4  # number of document chunks; not public
        langchain_enable_chunk = True  # whether to chunk documents; not public
        langchain_chunk_size = 512  # chunk size for document chunking; not public
        langchain_document_choice = ["All"]  # not public

        return (
            instruction,
            input_text,
            system_pre_context,
            stream_output,
            prompt_type.value,
            prompt_dict,
            temperature,
            top_p,
            top_k,
            beams,
            max_output_length,
            min_output_length,
            early_stopping,
            max_time,
            repetition_penalty,
            number_returns,
            enable_sampler,
            chat_mode,
            prompt,
            input_context_for_instruction,
            langchain_mode.value,
            langchain_top_k_docs,
            langchain_enable_chunk,
            langchain_chunk_size,
            langchain_document_choice,
        )

    def create(
        self,
        prompt: str,
        prompt_type: enums.PromptType = enums.PromptType.plain,
        input_context_for_instruction: str = "",
        enable_sampler: bool = False,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 40,
        beams: float = 1.0,
        early_stopping: bool = False,
        min_output_length: int = 0,
        max_output_length: int = 128,
        max_time: int = 180,
        repetition_penalty: float = 1.07,
        number_returns: int = 1,
        system_pre_context: str = "",
        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
    ) -> str:
        """
        Creates a new text completion.

        :param prompt: text prompt to generate completions for
        :param prompt_type: type of the prompt
        :param input_context_for_instruction: input context for instruction
        :param enable_sampler: enable or disable the sampler, required for use of
                temperature, top_p, top_k
        :param temperature: What sampling temperature to use, between 0 and 3.
                Lower values will make it more focused and deterministic, but may lead
                to repeat. Higher values will make the output more creative, but may
                lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: Number of searches for optimal overall probability.
                Higher values uses more GPU memory and compute.
        :param early_stopping: whether to stop early or not in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to search optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns: forwarded to the server unchanged; presumably the
                number of sequences to return (TODO confirm)
        :param system_pre_context: directly pre-appended without prompt processing
        :param langchain_mode: LangChain mode
        :return: response from the model
        """
        request_args = self._request_args(
            prompt=prompt,
            prompt_type=prompt_type,
            input_context_for_instruction=input_context_for_instruction,
            enable_sampler=enable_sampler,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            beams=beams,
            early_stopping=early_stopping,
            min_output_length=min_output_length,
            max_output_length=max_output_length,
            max_time=max_time,
            repetition_penalty=repetition_penalty,
            number_returns=number_returns,
            system_pre_context=system_pre_context,
            langchain_mode=langchain_mode,
        )
        return self._client._predict(*request_args, api_name="/submit_nochat")

    async def create_async(
        self,
        prompt: str,
        prompt_type: enums.PromptType = enums.PromptType.plain,
        input_context_for_instruction: str = "",
        enable_sampler: bool = False,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 40,
        beams: float = 1.0,
        early_stopping: bool = False,
        min_output_length: int = 0,
        max_output_length: int = 128,
        max_time: int = 180,
        repetition_penalty: float = 1.07,
        number_returns: int = 1,
        system_pre_context: str = "",
        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
    ) -> str:
        """
        Creates a new text completion asynchronously.

        :param prompt: text prompt to generate completions for
        :param prompt_type: type of the prompt
        :param input_context_for_instruction: input context for instruction
        :param enable_sampler: enable or disable the sampler, required for use of
                temperature, top_p, top_k
        :param temperature: What sampling temperature to use, between 0 and 3.
                Lower values will make it more focused and deterministic, but may lead
                to repeat. Higher values will make the output more creative, but may
                lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: Number of searches for optimal overall probability.
                Higher values uses more GPU memory and compute.
        :param early_stopping: whether to stop early or not in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to search optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns: forwarded to the server unchanged; presumably the
                number of sequences to return (TODO confirm)
        :param system_pre_context: directly pre-appended without prompt processing
        :param langchain_mode: LangChain mode
        :return: response from the model
        """
        request_args = self._request_args(
            prompt=prompt,
            prompt_type=prompt_type,
            input_context_for_instruction=input_context_for_instruction,
            enable_sampler=enable_sampler,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            beams=beams,
            early_stopping=early_stopping,
            min_output_length=min_output_length,
            max_output_length=max_output_length,
            max_time=max_time,
            repetition_penalty=repetition_penalty,
            number_returns=number_returns,
            system_pre_context=system_pre_context,
            langchain_mode=langchain_mode,
        )
        return await self._client._predict_async(
            *request_args, api_name="/submit_nochat"
        )


class ChatCompletion:
    """Chat completion"""

    def __init__(self, client: Client):
        self._client = client

    def create(
        self,
        prompt_type: enums.PromptType = enums.PromptType.plain,
        input_context_for_instruction: str = "",
        enable_sampler=False,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 40,
        beams: float = 1.0,
        early_stopping: bool = False,
        min_output_length: int = 0,
        max_output_length: int = 128,
        max_time: int = 180,
        repetition_penalty: float = 1.07,
        number_returns: int = 1,
        system_pre_context: str = "",
        langchain_mode: enums.LangChainMode = enums.LangChainMode.DISABLED,
    ) -> "ChatContext":
        """
        Creates a new chat context holding the given generation parameters.

        No request is sent here; the parameters are only stored, together with
        an empty chat history, in the returned context.

        :param prompt_type: type of the prompt
        :param input_context_for_instruction: input context for instruction
        :param enable_sampler: enable or disable the sampler, required for use of
                temperature, top_p, top_k
        :param temperature: What sampling temperature to use, between 0 and 3.
                Lower values will make it more focused and deterministic, but may lead
                to repeat. Higher values will make the output more creative, but may
                lead to hallucinations.
        :param top_p: cumulative probability of tokens to sample from
        :param top_k: number of tokens to sample from
        :param beams: Number of searches for optimal overall probability.
                Higher values uses more GPU memory and compute.
        :param early_stopping: whether to stop early or not in beam search
        :param min_output_length: minimum output length
        :param max_output_length: maximum output length
        :param max_time: maximum time to search optimal output
        :param repetition_penalty: penalty for repetition
        :param number_returns:
        :param system_pre_context: directly pre-appended without prompt processing
        :param langchain_mode: LangChain mode
        :return: a chat context with given parameters
        """
        # Insertion order matters: the values of this mapping are later
        # unpacked positionally when the chat request is sent.
        ordered_settings = [
            ("instruction", None),  # future prompts
            ("input", ""),  # ??
            ("system_pre_context", system_pre_context),
            ("stream_output", False),
            ("prompt_type", prompt_type.value),
            ("prompt_dict", ""),  # empty as prompt_type cannot be 'custom'
            ("temperature", temperature),
            ("top_p", top_p),
            ("top_k", top_k),
            ("beams", beams),
            ("max_output_length", max_output_length),
            ("min_output_length", min_output_length),
            ("early_stopping", early_stopping),
            ("max_time", max_time),
            ("repetition_penalty", repetition_penalty),
            ("number_returns", number_returns),
            ("enable_sampler", enable_sampler),
            ("chat_mode", True),
            ("instruction_nochat", ""),  # empty when chat_mode is True
            ("input_context_for_instruction", input_context_for_instruction),
            ("langchain_mode", langchain_mode.value),
            ("langchain_top_k_docs", 4),  # number of document chunks; not public
            ("langchain_enable_chunk", True),  # whether to chunk documents; not public
            ("langchain_chunk_size", 512),  # chunk size for chunking; not public
            ("langchain_document_choice", ["All"]),  # not public
            ("chatbot", []),  # chat history
        ]
        context_kwargs = collections.OrderedDict(ordered_settings)
        return ChatContext(self._client, context_kwargs)


class ChatContext:
    """Chat context"""

    def __init__(self, client: "Client", kwargs: OrderedDict[str, Any]):
        """
        Stores the client and the ordered request parameters.

        :param client: client used to send chat requests
        :param kwargs: ordered request parameters; must contain the
                ``instruction`` and ``chatbot`` keys used by ``chat``
        """
        self._client = client
        self._kwargs = kwargs

    def chat(self, prompt: str) -> Dict[str, str]:
        """
        Chat with the GPT.

        The prompt is appended to the chat history as a pending turn before the
        request is sent. If the request fails, or the response does not have
        the expected shape, the pending turn is removed again so the history
        never keeps a prompt without a reply.

        :param prompt: text prompt to generate completions for
        :return: chat reply as ``{"user": <prompt>, "gpt": <reply>}``
        :raises Exception: re-raises whatever the request or the response
                parsing raised, after rolling back the pending turn
        """
        self._kwargs["instruction"] = prompt
        history = self._kwargs["chatbot"]
        pending_turn = [prompt, None]
        history.append(pending_turn)
        try:
            response: Tuple[List[List[str]], str] = self._client._predict(
                *self._kwargs.values(), api_name="/instruction_bot"
            )
            # The first element of the response is the chat history; its last
            # entry holds the turn that was just answered.
            latest_turn = response[0][-1]
            user_message, reply = latest_turn[0], latest_turn[1]
        except Exception:
            # Roll back so a failed request does not leave a dangling turn that
            # would be re-sent with the next request.
            if history and history[-1] is pending_turn:
                history.pop()
            raise
        pending_turn[1] = reply
        return {"user": user_message, "gpt": reply}

    def chat_history(self) -> List[Dict[str, str]]:
        """Returns the full chat history."""
        return [{"user": turn[0], "gpt": turn[1]} for turn in self._kwargs["chatbot"]]