# Source: Hugging Face Space "pseudotensor/open-strawberry" (status: Running); file size: 20,681 bytes.

import ast
import datetime
import os
from typing import List, Dict, Generator
from dotenv import load_dotenv

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Load environment variables from .env file
load_dotenv()


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def anthropic_completion_with_backoff(client, *args, **kwargs):
    """Call the Anthropic prompt-caching messages endpoint with retries.

    Forwards ``*args`` and ``**kwargs`` unchanged to
    ``client.beta.prompt_caching.messages.create`` and returns its result.
    The ``retry`` decorator is configured with a random exponential wait
    (bounds 1 and 60) and stops after 3 attempts.
    """
    return client.beta.prompt_caching.messages.create(*args, **kwargs)


def get_anthropic(model: str,
                  prompt: str,
                  temperature: float = 0,
                  max_tokens: int = 4096,
                  system: str = '',
                  chat_history: List[Dict] = None,
                  secrets: Dict = {},
                  verbose=False) -> \
        Generator[dict, None, None]:
    """Stream a completion from an Anthropic model using prompt caching.

    Args:
        model: Model name; an ``anthropic:`` prefix is stripped.
        prompt: New user message, sent with an ephemeral ``cache_control`` marker.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        system: System prompt forwarded to the API.
        chat_history: Prior messages as dicts with ``role`` and ``content``.
            The list and its dicts are not modified.
        secrets: Mapping providing ``ANTHROPIC_API_KEY``. Only read.
        verbose: When true, print unknown chunk types and the final usage.

    Yields:
        ``{"text": ...}`` for every streamed text delta, followed by one final
        dict with ``output_tokens``, ``input_tokens``,
        ``cache_creation_input_tokens`` and ``cache_read_input_tokens``.

    Raises:
        ValueError: If ``ANTHROPIC_API_KEY`` is missing or empty. Because this
            is a generator, the error surfaces on the first iteration.
    """
    model = model.replace('anthropic:', '')

    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
    import anthropic

    clawd_key = secrets.get('ANTHROPIC_API_KEY')
    if not clawd_key:
        # Fail fast with a clear message. Passing a None client into the retry
        # wrapper would only repeat an AttributeError after each back-off.
        raise ValueError("ANTHROPIC_API_KEY not set")
    clawd_client = anthropic.Anthropic(api_key=clawd_key)

    if chat_history is None:
        chat_history = []

    messages = []

    # User messages located in the last three history positions are forwarded
    # untouched (keeping any cache_control marker). Older user messages are
    # rebuilt as plain text blocks, which drops their cache_control marker.
    keep_from = len(chat_history) - 3
    for i, message in enumerate(chat_history):
        if message["role"] != "user" or i >= keep_from:
            messages.append(message)
            continue
        content = message["content"]
        # Content may be a plain string or a list of content blocks.
        text = content if isinstance(content, str) else content[0]["text"]
        messages.append({
            "role": "user",
            "content": [{"type": "text", "text": text}]
        })

    # Add the new user message
    messages.append({
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt,
                "cache_control": {"type": "ephemeral"}
            }
        ]
    })

    response = anthropic_completion_with_backoff(clawd_client,
                                                 model=model,
                                                 max_tokens=max_tokens,
                                                 temperature=temperature,
                                                 system=system,
                                                 messages=messages,
                                                 stream=True
                                                 )

    output_tokens = 0
    input_tokens = 0
    cache_creation_input_tokens = 0
    cache_read_input_tokens = 0
    for chunk in response:
        if chunk.type == "content_block_start":
            # This is where we might find usage info in the future
            pass
        elif chunk.type == "content_block_delta":
            yield dict(text=chunk.delta.text)
        elif chunk.type == "message_delta":
            output_tokens = dict(chunk.usage).get('output_tokens', 0)
        elif chunk.type == "message_start":
            usage = dict(chunk.message.usage)
            input_tokens = usage.get('input_tokens', 0)
            cache_creation_input_tokens = usage.get('cache_creation_input_tokens', 0)
            cache_read_input_tokens = usage.get('cache_read_input_tokens', 0)
        else:
            if verbose:
                print("Unknown chunk type:", chunk.type)
                print("Chunk:", chunk)

    if verbose:
        # After streaming is complete, print the usage information
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
        print(f"Cache creation input tokens: {cache_creation_input_tokens}")
        print(f"Cache read input tokens: {cache_read_input_tokens}")
    yield dict(output_tokens=output_tokens, input_tokens=input_tokens,
               cache_creation_input_tokens=cache_creation_input_tokens,
               cache_read_input_tokens=cache_read_input_tokens)


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def openai_completion_with_backoff(client, *args, **kwargs):
    """Call ``client.chat.completions.create`` with retries.

    Forwards ``*args`` and ``**kwargs`` unchanged and returns the result.
    The ``retry`` decorator is configured with a random exponential wait
    (bounds 1 and 60) and stops after 3 attempts. Any client exposing
    ``chat.completions.create`` can be passed in.
    """
    return client.chat.completions.create(*args, **kwargs)


def get_openai(model: str,
               prompt: str,
               temperature: float = 0,
               max_tokens: int = 4096,
               system: str = '',
               chat_history: List[Dict] = None,
               secrets: Dict = {},
               verbose=False) -> Generator[dict, None, None]:
    """Stream a chat completion from OpenAI or an Ollama OpenAI-compatible server.

    Args:
        model: Model name. An ``ollama:`` prefix selects the
            ``OLLAMA_OPENAI_API_KEY`` / ``OLLAMA_OPENAI_BASE_URL`` secrets
            (base URL default ``http://localhost:11434/v1/``); otherwise the
            ``openai:`` prefix is stripped and ``OPENAI_API_KEY`` /
            ``OPENAI_BASE_URL`` (default ``https://api.openai.com/v1``) are used.
        prompt: New user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        system: System prompt, sent as the first message.
        chat_history: Prior messages as dicts with ``role`` and ``content``.
            List-style content is flattened to the text of its first block in
            a copy; the caller's list and dicts are not modified.
        secrets: Mapping providing the API key (and optionally the base URL).
        verbose: When true, print token usage after streaming.

    Yields:
        ``{"text": ...}`` for every streamed content delta, followed by one
        final dict with ``output_tokens`` and ``input_tokens`` (both remain 0
        when no streamed chunk carries ``usage``).

    Raises:
        ValueError: If the required API key is missing or empty. Because this
            is a generator, the error surfaces on the first iteration.
    """
    if model.startswith('ollama:'):
        model = model.replace('ollama:', '')
        key_name = 'OLLAMA_OPENAI_API_KEY'
        openai_base_url = secrets.get('OLLAMA_OPENAI_BASE_URL', 'http://localhost:11434/v1/')
    else:
        model = model.replace('openai:', '')
        key_name = 'OPENAI_API_KEY'
        openai_base_url = secrets.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
    openai_key = secrets.get(key_name)

    from openai import OpenAI

    if not openai_key:
        # Fail fast with a clear message. Passing a None client into the retry
        # wrapper would only repeat an AttributeError after each back-off.
        raise ValueError(f"{key_name} not set")
    openai_client = OpenAI(api_key=openai_key, base_url=openai_base_url)

    # Flatten list-style content into plain strings. New dicts are built so
    # the caller's history is left untouched (a shallow list copy would still
    # share, and therefore mutate, the caller's message dicts).
    history = []
    for message in chat_history or []:
        content = message["content"]
        if isinstance(content, list):
            message = {**message, "content": content[0]["text"]}
        history.append(message)

    messages = [{"role": "system", "content": system}] + history + [{"role": "user", "content": prompt}]

    response = openai_completion_with_backoff(openai_client,
                                              model=model,
                                              messages=messages,
                                              temperature=temperature,
                                              max_tokens=max_tokens,
                                              stream=True,
                                              )

    output_tokens = 0
    input_tokens = 0
    for chunk in response:
        # A chunk may carry an empty ``choices`` list (e.g. a usage-only chunk),
        # so guard before indexing.
        if chunk.choices and chunk.choices[0].delta.content:
            yield dict(text=chunk.choices[0].delta.content)
        if chunk.usage:
            output_tokens = chunk.usage.completion_tokens
            input_tokens = chunk.usage.prompt_tokens

    if verbose:
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
    yield dict(output_tokens=output_tokens, input_tokens=input_tokens)


def openai_messages_to_gemini_history(messages):
    """Converts OpenAI messages to Gemini history format.

    The input messages are only read; they are never modified.

    Args:
        messages: A list of OpenAI messages, each with "role" and "content" keys.
            "content" may be a string or a list of blocks, in which case the
            "text" of the first block is used.

    Returns:
        A list of dictionaries representing the chat history for Gemini.
        "user" messages keep their role, "assistant" messages become "model",
        and messages with any other role (e.g. "system") are skipped.
    """
    role_map = {"user": "user", "assistant": "model"}
    history = []
    for message in messages:
        # Work on a local value: assigning back into ``message`` would rewrite
        # the caller's history in place.
        content = message["content"]
        if isinstance(content, list):
            content = content[0]["text"]
        gemini_role = role_map.get(message["role"])
        if gemini_role is None:
            continue
        history.append({"role": gemini_role, "parts": [{"text": content}]})

    return history


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def gemini_send_message_with_backoff(chat, prompt, stream=True):
    """Call ``chat.send_message(prompt, stream=stream)`` with retries.

    Returns whatever ``send_message`` returns. The ``retry`` decorator is
    configured with a random exponential wait (bounds 1 and 60) and stops
    after 3 attempts.
    """
    return chat.send_message(prompt, stream=stream)


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def gemini_generate_content_with_backoff(model, prompt, stream=True):
    """Call ``model.generate_content(prompt, stream=stream)`` with retries.

    Returns whatever ``generate_content`` returns. The ``retry`` decorator is
    configured with a random exponential wait (bounds 1 and 60) and stops
    after 3 attempts.
    """
    return model.generate_content(prompt, stream=stream)


def get_google(model: str,
               prompt: str,
               temperature: float = 0,
               max_tokens: int = 4096,
               system: str = '',
               chat_history: List[Dict] = None,
               secrets: Dict = {},
               verbose=False) -> Generator[dict, None, None]:
    """Stream a completion from a Google Gemini model.

    Args:
        model: Model name; ``google:`` and ``gemini:`` prefixes are stripped.
        prompt: New user message.
        temperature: Sampling temperature placed in the generation config.
        max_tokens: Used as ``max_output_tokens`` in the generation config.
        system: System prompt. Only used as ``system_instruction`` when a
            cached-content model is created (see the review note below).
        chat_history: Prior OpenAI-style messages; converted with
            ``openai_messages_to_gemini_history``.
        secrets: Mapping providing ``GEMINI_API_KEY``. Only read.
        verbose: When true, print token usage after streaming.

    Yields:
        ``{"text": ...}`` for every streamed chunk with text, followed by one
        final dict with ``output_tokens``, ``input_tokens``,
        ``cache_read_input_tokens`` and ``cache_creation_input_tokens``
        (the last one is always 0).
    """
    model = model.replace('google:', '').replace('gemini:', '')

    import google.generativeai as genai

    gemini_key = secrets.get("GEMINI_API_KEY")
    genai.configure(api_key=gemini_key)
    # Create the model
    generation_config = {
        "temperature": temperature,
        "top_p": 0.95,
        "top_k": 64,
        "max_output_tokens": max_tokens,
        "response_mime_type": "text/plain",
    }

    if chat_history is None:
        chat_history = []

    # From here on the history is in Gemini format: {"role", "parts": [{"text"}]}.
    chat_history = openai_messages_to_gemini_history(chat_history)

    # NOTE: assume want own control.  Too many false positives by Google.
    from google.generativeai.types import HarmCategory
    from google.generativeai.types import HarmBlockThreshold
    safety_settings = {
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    }

    cache = None
    gemini_model = None
    # disable cache for now until work into things well
    use_cache = False
    if use_cache and model == 'gemini-1.5-pro':
        from google.generativeai import caching
        # The history was already converted to Gemini format above, so the text
        # lives under "parts" (reading msg['content'] here would raise KeyError).
        history_texts = [msg['parts'][0]['text'] for msg in chat_history]
        # Estimate token count (this is a rough estimate, you may need a more accurate method)
        estimated_tokens = len(prompt.split()) + sum(len(text.split()) for text in history_texts)

        if estimated_tokens > 32000:
            cache = caching.CachedContent.create(
                model=model,
                display_name=f'cache_{datetime.datetime.now().isoformat()}',
                system_instruction=system,
                contents=[prompt] + history_texts,
                ttl=datetime.timedelta(minutes=5),  # Set an appropriate TTL.  Short for now for cost savings.
            )
            gemini_model = genai.GenerativeModel.from_cached_content(cached_content=cache)

    if gemini_model is None:
        # NOTE(review): `system` is not passed on this (default) path, so the
        # system prompt is not sent to Gemini here -- confirm whether intended.
        gemini_model = genai.GenerativeModel(model_name=model,
                                             generation_config=generation_config,
                                             safety_settings=safety_settings)

    if cache:
        response = gemini_generate_content_with_backoff(gemini_model, prompt, stream=True)
    else:
        chat = gemini_model.start_chat(history=chat_history)
        response = gemini_send_message_with_backoff(chat, prompt, stream=True)

    output_tokens = 0
    input_tokens = 0
    cache_read_input_tokens = 0
    cache_creation_input_tokens = 0

    for chunk in response:
        if chunk.text:
            yield dict(text=chunk.text)
        if chunk.usage_metadata:
            output_tokens = chunk.usage_metadata.candidates_token_count
            input_tokens = chunk.usage_metadata.prompt_token_count
            cache_read_input_tokens = chunk.usage_metadata.cached_content_token_count
            cache_creation_input_tokens = 0  # This might need to be updated if available in the API

    if verbose:
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
        print(f"Cached tokens: {cache_read_input_tokens}")

    yield dict(output_tokens=output_tokens, input_tokens=input_tokens,
               cache_read_input_tokens=cache_read_input_tokens,
               cache_creation_input_tokens=cache_creation_input_tokens)


def delete_cache(cache):
    """Delete ``cache`` when one is given and report the outcome on stdout.

    A falsy ``cache`` only prints a notice; otherwise ``cache.delete()`` is
    called and the cache's ``display_name`` is printed.
    """
    if not cache:
        print("No cache to delete.")
        return

    cache.delete()
    print(f"Cache {cache.display_name} deleted.")


def get_groq(model: str,
             prompt: str,
             temperature: float = 0,
             max_tokens: int = 4096,
             system: str = '',
             chat_history: List[Dict] = None,
             secrets: Dict = {},
             verbose=False) -> Generator[dict, None, None]:
    """Stream a chat completion from Groq.

    Args:
        model: Model name; a ``groq:`` prefix is stripped.
        prompt: New user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        system: System prompt, sent as the first message.
        chat_history: Prior messages as dicts with ``role`` and ``content``.
            Not modified.
        secrets: Mapping providing ``GROQ_API_KEY``. Only read.
        verbose: When true, print token usage after streaming.

    Yields:
        ``{"text": ...}`` for every streamed content delta, followed by one
        final dict with ``output_tokens`` and ``input_tokens`` (both remain 0
        when no streamed chunk carries ``usage``).
    """
    model = model.replace('groq:', '')

    from groq import Groq

    groq_key = secrets.get("GROQ_API_KEY")
    client = Groq(api_key=groq_key)

    if chat_history is None:
        chat_history = []

    # NOTE(review): unlike get_openai, list-style message content is forwarded
    # as-is here -- confirm the Groq API accepts it.
    messages = [{"role": "system", "content": system}] + list(chat_history) + [{"role": "user", "content": prompt}]

    stream = openai_completion_with_backoff(client,
                                            messages=messages,
                                            model=model,
                                            temperature=temperature,
                                            max_tokens=max_tokens,
                                            stream=True,
                                            )

    output_tokens = 0
    input_tokens = 0
    for chunk in stream:
        # Guard against chunks with an empty ``choices`` list before indexing,
        # matching the check in get_openai_azure.
        if chunk.choices and chunk.choices[0].delta.content:
            yield dict(text=chunk.choices[0].delta.content)
        if chunk.usage:
            output_tokens = chunk.usage.completion_tokens
            input_tokens = chunk.usage.prompt_tokens

    if verbose:
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
    yield dict(output_tokens=output_tokens, input_tokens=input_tokens)


def get_cerebras(model: str,
                 prompt: str,
                 temperature: float = 0,
                 max_tokens: int = 4096,
                 system: str = '',
                 chat_history: List[Dict] = None,
                 secrets: Dict = {},
                 verbose=False) -> Generator[dict, None, None]:
    """Stream a chat completion from Cerebras.

    Args:
        model: Model name; a ``cerebras:`` prefix is stripped.
        prompt: New user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        system: System prompt, sent as the first message.
        chat_history: Prior messages as dicts with ``role`` and ``content``.
            Not modified.
        secrets: Mapping providing ``CEREBRAS_OPENAI_API_KEY``. Only read.
        verbose: When true, print token usage after streaming.

    Yields:
        ``{"text": ...}`` for every streamed content delta, followed by one
        final dict with ``output_tokens`` and ``input_tokens`` (both remain 0
        when no streamed chunk carries ``usage``).
    """
    # context_length is only 8207
    model = model.replace('cerebras:', '')

    from cerebras.cloud.sdk import Cerebras

    api_key = secrets.get("CEREBRAS_OPENAI_API_KEY")
    client = Cerebras(api_key=api_key)

    if chat_history is None:
        chat_history = []

    # NOTE(review): unlike get_openai, list-style message content is forwarded
    # as-is here -- confirm the Cerebras API accepts it.
    messages = [{"role": "system", "content": system}] + list(chat_history) + [{"role": "user", "content": prompt}]

    stream = openai_completion_with_backoff(client,
                                            messages=messages,
                                            model=model,
                                            temperature=temperature,
                                            max_tokens=max_tokens,
                                            stream=True,
                                            )

    output_tokens = 0
    input_tokens = 0
    for chunk in stream:
        # Guard against chunks with an empty ``choices`` list before indexing,
        # matching the check in get_openai_azure.
        if chunk.choices and chunk.choices[0].delta.content:
            yield dict(text=chunk.choices[0].delta.content)
        if chunk.usage:
            output_tokens = chunk.usage.completion_tokens
            input_tokens = chunk.usage.prompt_tokens

    if verbose:
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
    yield dict(output_tokens=output_tokens, input_tokens=input_tokens)


def get_openai_azure(model: str,
                     prompt: str,
                     temperature: float = 0,
                     max_tokens: int = 4096,
                     system: str = '',
                     chat_history: List[Dict] = None,
                     secrets: Dict = {},
                     verbose=False) -> Generator[dict, None, None]:
    """Stream a chat completion from an Azure OpenAI deployment.

    Args:
        model: Model name; ``openai_azure:`` and ``azure:`` prefixes are stripped.
        prompt: New user message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens to generate.
        system: System prompt, sent as the first message.
        chat_history: Prior messages as dicts with ``role`` and ``content``.
            Not modified.
        secrets: Mapping providing ``AZURE_OPENAI_ENDPOINT``,
            ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_DEPLOYMENT`` and optionally
            ``AZURE_OPENAI_API_VERSION`` (default ``2024-07-01-preview``).
        verbose: When true, print token usage after streaming.

    Yields:
        ``{"text": ...}`` for every streamed content delta, followed by one
        final dict with ``output_tokens`` and ``input_tokens`` (both remain 0
        when no streamed chunk carries ``usage``).

    Raises:
        AssertionError: If the endpoint, API key or deployment is missing.
            Because this is a generator, it surfaces on the first iteration.
    """
    # Strip the longer prefix first: 'azure:' is a substring of 'openai_azure:',
    # so removing it first would turn 'openai_azure:x' into 'openai_x'.
    for prefix in ('openai_azure:', 'azure:'):
        model = model.replace(prefix, '')

    from openai import AzureOpenAI

    azure_endpoint = secrets.get("AZURE_OPENAI_ENDPOINT")  # e.g. https://project.openai.azure.com
    azure_key = secrets.get("AZURE_OPENAI_API_KEY")
    azure_deployment = secrets.get("AZURE_OPENAI_DEPLOYMENT")  # i.e. deployment name with some models deployed
    azure_api_version = secrets.get('AZURE_OPENAI_API_VERSION', '2024-07-01-preview')
    # Assertions are kept so callers continue to see AssertionError here.
    assert azure_endpoint is not None, "Azure OpenAI endpoint not set"
    assert azure_key is not None, "Azure OpenAI API key not set"
    assert azure_deployment is not None, "Azure OpenAI deployment not set"

    client = AzureOpenAI(
        azure_endpoint=azure_endpoint,
        api_key=azure_key,
        api_version=azure_api_version,
        azure_deployment=azure_deployment,
    )

    if chat_history is None:
        chat_history = []

    messages = [{"role": "system", "content": system}] + chat_history + [{"role": "user", "content": prompt}]

    response = openai_completion_with_backoff(client,
                                              model=model,
                                              messages=messages,
                                              temperature=temperature,
                                              max_tokens=max_tokens,
                                              stream=True
                                              )

    output_tokens = 0
    input_tokens = 0
    for chunk in response:
        # Some chunks carry an empty ``choices`` list, so guard before indexing.
        if chunk.choices and chunk.choices[0].delta.content:
            yield dict(text=chunk.choices[0].delta.content)
        if chunk.usage:
            output_tokens = chunk.usage.completion_tokens
            input_tokens = chunk.usage.prompt_tokens

    if verbose:
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens: {input_tokens}")
    yield dict(output_tokens=output_tokens, input_tokens=input_tokens)


def to_list(x):
    """Normalise a model-name setting into a list.

    Args:
        x: A falsy value, a list/tuple, or a string. A string may be the
            Python literal of a list (e.g. ``"['a', 'b']"``) or a single name.

    Returns:
        ``[]`` for a falsy ``x``; the items of ``x`` when it is already a
        list or tuple; the parsed list when ``x`` is the literal of a list;
        otherwise ``[x]``.
    """
    if not x:
        return []
    if isinstance(x, (list, tuple)):
        return list(x)
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. a bare model name): treat as a single item.
        return [x]
    # Return the parsed list itself; a literal of any other type (a number,
    # a quoted string, ...) is kept as the single original value.
    return parsed if isinstance(parsed, list) else [x]


def get_model_names(secrets, on_hf_spaces=False):
    """List the prefixed model names available for the configured API keys.

    Args:
        secrets: Mapping of settings; only consulted when ``on_hf_spaces`` is
            true. Otherwise ``os.environ`` is used instead.
        on_hf_spaces: Whether to read settings from ``secrets`` rather than
            the process environment.

    Returns:
        A list of ``"<provider>:<model>"`` strings in the order anthropic,
        openai, google, groq, cerebras, azure, ollama. A provider contributes
        entries only when its API key setting is truthy.
    """
    if not on_hf_spaces:
        secrets = os.environ

    anthropic_models = []
    if secrets.get('ANTHROPIC_API_KEY'):
        anthropic_models = ['claude-3-5-sonnet-20240620', 'claude-3-haiku-20240307', 'claude-3-opus-20240229']

    openai_models = []
    if secrets.get('OPENAI_API_KEY'):
        if secrets.get('OPENAI_MODEL_NAME'):
            openai_models = to_list(secrets.get('OPENAI_MODEL_NAME'))
        else:
            openai_models = ['gpt-4o', 'gpt-4-turbo-2024-04-09', 'gpt-4o-mini']

    azure_models = []
    if secrets.get('AZURE_OPENAI_API_KEY'):
        if secrets.get('AZURE_OPENAI_MODEL_NAME'):
            azure_models = to_list(secrets.get('AZURE_OPENAI_MODEL_NAME'))
        else:
            azure_models = ['gpt-4o', 'gpt-4-turbo-2024-04-09', 'gpt-4o-mini']

    google_models = []
    if secrets.get('GEMINI_API_KEY'):
        google_models = ['gemini-1.5-pro-latest', 'gemini-1.5-flash-latest']

    groq_models = []
    if secrets.get('GROQ_API_KEY'):
        groq_models = ['llama-3.1-70b-versatile',
                       'llama-3.1-8b-instant',
                       'llama3-groq-70b-8192-tool-use-preview',
                       'llama3-groq-8b-8192-tool-use-preview',
                       'mixtral-8x7b-32768']

    cerebras_models = []
    if secrets.get('CEREBRAS_OPENAI_API_KEY'):
        cerebras_models = ['llama3.1-70b', 'llama3.1-8b']

    ollama_model = []
    if secrets.get('OLLAMA_OPENAI_API_KEY'):
        # Read the model name from the same settings source as the key, with
        # the process environment as a fallback. A missing name now yields no
        # Ollama models instead of raising KeyError.
        ollama_name = secrets.get('OLLAMA_OPENAI_MODEL_NAME') or os.environ.get('OLLAMA_OPENAI_MODEL_NAME')
        ollama_model = to_list(ollama_name)

    groq_models = ['groq:' + x for x in groq_models]
    cerebras_models = ['cerebras:' + x for x in cerebras_models]
    azure_models = ['azure:' + x for x in azure_models]
    openai_models = ['openai:' + x for x in openai_models]
    google_models = ['google:' + x for x in google_models]
    anthropic_models = ['anthropic:' + x for x in anthropic_models]
    # Ollama names may already carry the prefix; do not add it twice.
    ollama = ['ollama:' + x if 'ollama:' not in x else x for x in ollama_model]

    return anthropic_models + openai_models + google_models + groq_models + cerebras_models + azure_models + ollama


def get_model_api(model: str):
    """Return the provider generator function matching the prefix of ``model``.

    Recognised prefixes are ``anthropic:``, ``openai:``, ``ollama:``,
    ``google:``, ``groq:``, ``cerebras:`` and ``azure:``.

    Raises:
        AssertionError: If ``model`` is ``''`` or ``None``.
        ValueError: If ``model`` carries none of the recognised prefixes.
    """
    assert model not in ['', None], "Model not set, need to add API key to have models appear and select one."

    # Guard-clause dispatch: the first matching prefix wins.
    if model.startswith('anthropic:'):
        return get_anthropic
    if model.startswith(('openai:', 'ollama:')):
        return get_openai
    if model.startswith('google:'):
        return get_google
    if model.startswith('groq:'):
        return get_groq
    if model.startswith('cerebras:'):
        return get_cerebras
    if model.startswith('azure:'):
        return get_openai_azure

    unsupported = f"Unsupported model: {model}.  Ensure to add prefix (e.g. openai:, google:, groq:, cerebras:, azure:, ollama:, anthropic:)"
    raise ValueError(unsupported)