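"""Reader model factories for AuditQA: an NVIDIA serverless InferenceClient and a
dedicated Hugging Face inference endpoint wrapped in a LangChain ChatHuggingFace model."""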
from huggingface_hub import InferenceClient
from auditqa.process_chunks import getconfig
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.chat_models.huggingface import ChatHuggingFace
import os
from dotenv import load_dotenv
load_dotenv()
model_config = getconfig("model_params.cfg")
NVIDIA_SERVER = os.environ["NVIDIA_SERVERLESS"]
HF_token = os.environ["LLAMA_3_1"]
def nvidia_client():
    """Return the NVIDIA serverless inference client."""
    client = InferenceClient(
        base_url=model_config.get('reader', 'NVIDIA_ENDPOINT'),
        api_key=NVIDIA_SERVER)
    print("getting nvidia client")
    return client
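
# Minimal usage sketch (illustrative, not part of the original module). The model id
# below is an assumption; the model actually served depends on the NVIDIA_ENDPOINT
# configured in model_params.cfg, and the prompt is only for demonstration.
#
#     client = nvidia_client()
#     response = client.chat_completion(
#         model="meta/llama-3.1-70b-instruct",  # assumed model id
#         messages=[{"role": "user", "content": "Summarize the audit findings."}],
#         max_tokens=256,
#     )
#     print(response.choices[0].message.content)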
def dedicated_endpoint():
    """Return the dedicated Hugging Face endpoint wrapped as a chat model."""
    # Set up the streaming callback handler
    callback = StreamingStdOutCallbackHandler()

    # Initialize the HuggingFaceEndpoint with streaming enabled
    llm_qa = HuggingFaceEndpoint(
        endpoint_url=model_config.get('reader', 'DEDICATED_ENDPOINT'),
        max_new_tokens=int(model_config.get('reader', 'MAX_TOKENS')),
        repetition_penalty=1.03,
        timeout=70,
        huggingfacehub_api_token=HF_token,
        streaming=True,        # Enable streaming for real-time token generation
        callbacks=[callback],  # Add the streaming callback handler
    )

    # Create a ChatHuggingFace instance with the streaming-enabled endpoint
    chat_model = ChatHuggingFace(llm=llm_qa)
    print("getting dedicated endpoint wrapped in ChatHuggingFace")
    return chat_model
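
# Minimal sketch of how the chat model factory might be exercised locally
# (assumes the .env file provides NVIDIA_SERVERLESS and LLAMA_3_1, and that
# model_params.cfg defines the 'reader' section); the question is illustrative only.
if __name__ == "__main__":
    chat_model = dedicated_endpoint()
    # ChatHuggingFace follows the LangChain Runnable interface, so invoke()
    # returns an AIMessage whose .content holds the generated answer.
    answer = chat_model.invoke("What does the audit report say about procurement?")
    print(answer.content)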