from llama_index.llms.huggingface import HuggingFaceLLM, HuggingFaceInferenceAPI
from llama_index.llms.openai import OpenAI
from llama_index.llms.replicate import Replicate
from dotenv import load_dotenv
import os
import streamlit as st

load_dotenv()

# Download the model from the Hugging Face Hub and run it locally:
# llm_mixtral_8x7b = HuggingFaceLLM(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")

# Or call it remotely through the Hugging Face Inference API:
# llm_llama_2_7b_chat = HuggingFaceInferenceAPI(
#     model_name="meta-llama/Llama-2-7b-chat-hf",
#     token=os.getenv("HUGGINGFACE_API_TOKEN"),
# )

# Maps model name -> source (provider integration).
integrated_llms = {
    "gpt-3.5-turbo-0125": "openai",
    "meta/llama-2-13b-chat": "replicate",
    "mistralai/Mistral-7B-Instruct-v0.2": "huggingface",
    # "mistralai/Mixtral-8x7B-v0.1": "huggingface",  # 93 GB model
    # "meta-llama/Meta-Llama-3-8B": "huggingface",  # too large (>10 GB) for the LlamaIndex HF inference API to load
}


def load_llm(model_name: str, source: str = "huggingface"):
    """Instantiate an LLM for the given model name and source, or return None."""
    print("model_name: ", model_name, "source: ", source)
    if integrated_llms.get(model_name) is None:
        return None
    try:
        if source.startswith("openai"):
            llm = OpenAI(
                model=model_name,
                api_key=st.session_state.openai_api_key,
                temperature=0.0,
            )
        elif source.startswith("replicate"):
            # The replicate client reads its API token from the environment,
            # so export it rather than passing it as prompt_key.
            os.environ["REPLICATE_API_TOKEN"] = st.session_state.replicate_api_token
            llm = Replicate(
                model=model_name,
                is_chat_model=True,
                additional_kwargs={"max_new_tokens": 250},
                temperature=0.0,
            )
        elif source.startswith("huggingface"):
            llm = HuggingFaceInferenceAPI(
                model_name=model_name,
                token=st.session_state.hf_token,
            )
        else:
            return None
        return llm
    except Exception as e:
        print(e)
        return None
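
# Usage sketch (an assumption, not part of the original module): a Streamlit page
# is expected to have stored the provider credentials in st.session_state
# (openai_api_key, replicate_api_token, hf_token) before load_llm is called.
#
# model_name = "gpt-3.5-turbo-0125"
# llm = load_llm(model_name, integrated_llms[model_name])
# if llm is not None:
#     print(llm.complete("Say hello in one short sentence."))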