from huggingface_hub import InferenceClient
from auditqa.process_chunks import getconfig
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.chat_models.huggingface import ChatHuggingFace
import os
from dotenv import load_dotenv
load_dotenv()

model_config = getconfig("model_params.cfg")
# API tokens for the hosted endpoints, read from the environment populated by load_dotenv()
NVIDIA_SERVER = os.environ["NVIDIA_SERVERLESS"]
HF_token = os.environ["LLAMA_3_1"]


def nvidia_client():
    """ Returns an InferenceClient pointed at the NVIDIA serverless endpoint """
    client = InferenceClient(
        base_url=model_config.get('reader', 'NVIDIA_ENDPOINT'),
        api_key=NVIDIA_SERVER)
    print("getting nvidia client")

    return client

def dedicated_endpoint():
    """ Returns the dedicated HF endpoint wrapped in a ChatHuggingFace chat model """

    # Set up the streaming callback handler so generated tokens are printed as they arrive
    callback = StreamingStdOutCallbackHandler()

    # Initialize the HuggingFaceEndpoint with streaming enabled
    llm_qa = HuggingFaceEndpoint(
        endpoint_url=model_config.get('reader', 'DEDICATED_ENDPOINT'),
        max_new_tokens=int(model_config.get('reader','MAX_TOKENS')),
        repetition_penalty=1.03,
        timeout=70,
        huggingfacehub_api_token=HF_token,
        streaming=True, # Enable streaming for real-time token generation
        callbacks=[callback] # Add the streaming callback handler
    )
    
    # Create a ChatHuggingFace instance with the streaming-enabled endpoint
    chat_model = ChatHuggingFace(llm=llm_qa)
    print("getting dedicated endpoint wrapped in ChathuggingFace ")
    return chat_model
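
# Illustrative usage sketch: a minimal example of calling both factories, assuming the
# environment variables (NVIDIA_SERVERLESS, LLAMA_3_1) and the model_params.cfg keys
# read above (reader/NVIDIA_ENDPOINT, reader/DEDICATED_ENDPOINT, reader/MAX_TOKENS)
# are configured. The prompt text is a placeholder, not taken from the project.
if __name__ == "__main__":
    # ChatHuggingFace implements the LangChain chat-model interface, so invoke()
    # accepts a plain prompt string; streamed tokens are echoed to stdout by the
    # StreamingStdOutCallbackHandler attached in dedicated_endpoint().
    chat_model = dedicated_endpoint()
    response = chat_model.invoke("Summarise the key findings of the audit report.")
    print(response.content)

    # The NVIDIA client is a huggingface_hub InferenceClient; assuming the serverless
    # endpoint exposes a chat route, chat_completion() can be called on it directly.
    nvidia = nvidia_client()
    completion = nvidia.chat_completion(
        messages=[{"role": "user", "content": "Summarise the key findings of the audit report."}],
        max_tokens=256,
    )
    print(completion.choices[0].message.content)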