import json
import os
import time
from typing import Any, Dict, List, Optional

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms.base import LLM
from langchain.vectorstores import Chroma
from pydantic import BaseModel, Field

load_dotenv()

path_work = "."
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Embeddings and retriever over the persisted Chroma DB (expected at ./cromadb_llama2-papers).
embeddings = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
vectordb = Chroma(
    persist_directory=path_work + "/cromadb_llama2-papers",
    embedding_function=embeddings,
)
retriever = vectordb.as_retriever(search_kwargs={"k": 5})


class KwArgsModel(BaseModel):
    kwargs: Dict[str, Any] = Field(default_factory=dict)


class CustomInferenceClient(LLM, KwArgsModel):
    """LangChain LLM wrapper around the Hugging Face InferenceClient."""

    model_name: str
    inference_client: InferenceClient

    def __init__(self, model_name: str, hf_token: str, kwargs: Optional[Dict[str, Any]] = None):
        inference_client = InferenceClient(model=model_name, token=hf_token)
        super().__init__(
            model_name=model_name,
            kwargs=kwargs or {},
            inference_client=inference_client,
        )

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        # Stream chunks from the Inference API and join them into a single string.
        response_gen = self.inference_client.text_generation(prompt, **self.kwargs, stream=True)
        return "".join(response_gen)

    @property
    def _llm_type(self) -> str:
        return "custom"

    @property
    def _identifying_params(self) -> dict:
        return {"model_name": self.model_name}


kwargs = {
    "max_new_tokens": 256,
    "temperature": 0.9,
    "top_p": 0.6,
    "repetition_penalty": 1.3,
    "do_sample": True,
}

model_list = [
    "meta-llama/Llama-2-13b-chat-hf",
    "HuggingFaceH4/zephyr-7b-alpha",
    "meta-llama/Llama-2-70b-chat-hf",
    "tiiuae/falcon-180B-chat",
]

qa_chain = None


def load_model(model_selected):
    """Build a RetrievalQA chain on top of the selected model."""
    global qa_chain
    llm = CustomInferenceClient(model_name=model_selected, hf_token=hf_token, kwargs=kwargs)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        verbose=True,
    )


load_model("meta-llama/Llama-2-70b-chat-hf")


def model_select(model_selected):
    load_model(model_selected)
    return f"Model {model_selected} loaded."
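# Illustrative, commented-out usage: the RetrievalQA chain can also be queried directly,
# without the Gradio UI defined below. The question string is only a placeholder, not part
# of the original app; uncomment to try it (it will call the remote Inference API).
#
# sample = qa_chain("What architecture does the Llama-2 model use?")
# print(sample["result"])
# print([doc.metadata["source"] for doc in sample["source_documents"]])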
def predict(message, chatbot, temperature=0.9, max_new_tokens=512, top_p=0.6, repetition_penalty=1.3):
    # Note: generation parameters are currently fixed in the global `kwargs` dict;
    # the values passed here are only sanity-checked.
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    # Run the RetrievalQA chain and collect the answer plus its source documents.
    llm_response = qa_chain(message)
    res_result = llm_response["result"]
    res_relevant_doc = [source.metadata["source"] for source in llm_response["source_documents"]]
    response = (
        f"{res_result}\n\n"
        "[Source papers backing this answer (ctrl + click!)]:\n"
        f" \n {res_relevant_doc}"
    )
    print("response: =====> \n", response, "\n\n")

    # Wrap each line of the answer in a token entry so it can be streamed back
    # to the chat window character by character.
    tokens = response.split("\n")
    token_list = [{"id": idx + 1, "text": token} for idx, token in enumerate(tokens)]
    response = {"data": {"token": token_list}}

    data_dict = response.get("data", {})
    token_list = data_dict.get("token", [])

    partial_message = ""
    for token_entry in token_list:
        if token_entry:
            try:
                token_text = token_entry.get("text", None)
                if token_text:
                    for char in token_text:
                        partial_message += char
                        yield partial_message
                        time.sleep(0.01)
                else:
                    print(f"[[Warning]] ==> The key 'text' does not exist or is None in this token entry: {token_entry}")
            except KeyError as e:
                gr.Warning(f"KeyError: {e} occurred for token entry: {token_entry}")
                continue


title = "Generative QA (with RAG) over papers related to the Llama-2 model (using Llama-2-70b and other models)"
description = """This app is built for QA rather than for keeping chat history, so please use it in single-turn fashion.
Llama-2 70b is the default model, but it may exceed the GPU service quota and return an error; in that case, please switch
to and load a different model at the bottom of the page. (Note that Llama-2 70b is the most accurate of the available models.)
"""
css = """.toast-wrap { display: none !important } """
examples = [
    ["Can you tell me about the llama-2 model?"],
    ["What is percent accuracy, using the SPP layer as features on the SPP (ZF-5) model?"],
    ["How much less accurate is using the SPP layer as features on the SPP (ZF-5) model compared to using the same model on the undistorted full image?"],
    ["tell me about method for human pose estimation based on DNNs"],
]


def vote(data: gr.LikeData):
    if data.liked:
        print("You upvoted this response: " + data.value)
    else:
        print("You downvoted this response: " + data.value)


additional_inputs = [
    gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
    gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=4096, step=64, interactive=True, info="The maximum number of new tokens"),
    gr.Slider(label="Top-p (nucleus sampling)", value=0.6, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
    gr.Slider(label="Repetition penalty", value=1.2, minimum=1.0, maximum=2.0, step=0.05, interactive=True, info="Penalize repeated tokens"),
]

chatbot_stream = gr.Chatbot(
    avatar_images=(
        "https://drive.google.com/uc?id=18xKoNOHN15H_qmGhK__VKnGjKjirrquW",
        "https://drive.google.com/uc?id=1tfELAQW_VbPCy6QTRbexRlwAEYo8rSSv",
    ),
    bubble_full_width=False,
)

chat_interface_stream = gr.ChatInterface(
    predict,
    title=title,
    description=description,
    chatbot=chatbot_stream,
    css=css,
    examples=examples,
)

with gr.Blocks() as demo:
    with gr.Tab("Streaming"):
        chatbot_stream.like(vote, None, None)
        chat_interface_stream.render()
        with gr.Row():
            with gr.Column(scale=6):
                with gr.Row():
                    model_selector = gr.Dropdown(model_list, label="Select model", value="meta-llama/Llama-2-70b-chat-hf", scale=5)
                    submit_btn1 = gr.Button(value="Load model", scale=1)
            with gr.Column(scale=4):
                model_status = gr.Textbox(value="", label="Model status")
                submit_btn1.click(model_select, inputs=[model_selector], outputs=[model_status])

demo.queue(concurrency_count=75, max_size=100).launch(debug=True, share=True)
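# The app assumes the Chroma index at ./cromadb_llama2-papers already exists. A minimal,
# commented-out sketch of how such an index could be built from a folder of PDFs follows;
# the "papers/" directory and the chunking parameters are illustrative assumptions, not
# part of the original app.
#
# from langchain.document_loaders import PyPDFDirectoryLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
#
# docs = PyPDFDirectoryLoader("papers/").load()
# chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
# db = Chroma.from_documents(chunks, embedding=embeddings,
#                            persist_directory=path_work + "/cromadb_llama2-papers")
# db.persist()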