Spaces:
Sleeping
Sleeping
File size: 9,812 Bytes
5b3e513 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
# Base directory; the Chroma persist directory further down is resolved under it.
path_work = "."

# hf_token: loaded from the process environment after reading a local .env file.
import os
from dotenv import load_dotenv

load_dotenv()
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# [Option 1] Large hosted model behind a LangChain custom LLM (HF InferenceClient):
# the 70B model is free to call and holds up against OpenAI (streaming is not wired up yet).
# Other checkpoints that can be swapped in:
#   model_name = "tiiuae/falcon-180B-chat"
#   model_name = "NousResearch/Llama-2-70b-chat-hf"
#   model_name = "meta-llama/Llama-2-13b-chat-hf"
#   model_name = "meta-llama/Llama-2-7b-chat-hf"
#   model_name = "HuggingFaceH4/zephyr-7b-alpha"
model_name = "meta-llama/Llama-2-70b-chat-hf"

# Generation options handed to the custom LLM below.
kwargs = dict(
    max_new_tokens=256,
    temperature=0.9,
    top_p=0.6,
    repetition_penalty=1.3,
    do_sample=True,
)
# Custom LLM
from pydantic import BaseModel, Field
from typing import Any, Optional, Dict, List
from huggingface_hub import InferenceClient
from langchain.llms.base import LLM
class KwArgsModel(BaseModel):
    """Pydantic mixin that declares the ``kwargs`` field used by CustomInferenceClient."""

    # Generation options; CustomInferenceClient._call unpacks them into
    # InferenceClient.text_generation.
    kwargs: Dict[str, Any] = Field(default_factory=dict)


class CustomInferenceClient(LLM, KwArgsModel):
    """LangChain ``LLM`` backed by ``huggingface_hub.InferenceClient``.

    Attributes:
        model_name: Hub model id the inference client is bound to.
        inference_client: Client used for every text-generation request.
        kwargs: Generation options (inherited from ``KwArgsModel``).
    """

    model_name: str
    inference_client: InferenceClient

    def __init__(
        self,
        model_name: str,
        hf_token: Optional[str] = None,
        kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Build the inference client and initialise the pydantic model.

        Args:
            model_name: Hub model id to query.
            hf_token: Hugging Face API token; ``None`` creates the client
                without an explicit token.
            kwargs: Generation options forwarded on every call; ``None``
                means no extra options.
        """
        inference_client = InferenceClient(model=model_name, token=hf_token)
        super().__init__(
            model_name=model_name,
            # NOTE(review): no ``hf_token`` field is declared on this model, so
            # this relies on the base model tolerating extra keyword arguments.
            hf_token=hf_token,
            # ``kwargs`` is a non-optional dict field; never pass None through.
            kwargs=kwargs if kwargs is not None else {},
            inference_client=inference_client,
        )

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None
    ) -> str:
        """Generate a completion for ``prompt`` and return it as one string.

        Args:
            prompt: Text sent to the model.
            stop: Must be ``None``; stop sequences are not supported.

        Raises:
            ValueError: If ``stop`` is provided.
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        # The request is made in streaming mode and the streamed pieces are
        # joined, so callers always receive the complete text.
        response_gen = self.inference_client.text_generation(prompt, **self.kwargs, stream=True)
        return ''.join(response_gen)

    @property
    def _llm_type(self) -> str:
        """Identifier reported to LangChain for this LLM implementation."""
        return "custom"

    @property
    def _identifying_params(self) -> dict:
        """Parameters that identify this LLM instance (the model name)."""
        return {"model_name": self.model_name}
# Usage example:
# prompt = "How do you make cheese?"
# prompt = "Tell me the names of the last 10 U.S. presidents"
prompt = "Tell me 10 of the world's largest buildings in high order"

# When an hf_token is used:
llm = CustomInferenceClient(model_name=model_name, hf_token=hf_token, kwargs=kwargs)
# When no hf_token is used:
# llm = CustomInferenceClient(model_name=model_name, kwargs=kwargs)

# Create the embedding object
from langchain.embeddings import HuggingFaceInstructEmbeddings

embeddings = HuggingFaceInstructEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"},
)

# Load the vector DB
path_work = '.'
from langchain.vectorstores import Chroma

vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=path_work + '/cromadb_llama2-papers',
)
# Retrieve the 5 best-matching chunks per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

# Build the RetrievalQA chain
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    # llm=OpenAI(),  # from langchain.llms import OpenAI
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
    return_source_documents=True,
)
# Bare expression: evaluates the name and discards it (no effect in a script).
qa_chain
# Gradio
import json
import os
import gradio as gr
# Stream text
def predict(message, chatbot, temperature=0.9, max_new_tokens=512, top_p=0.6, repetition_penalty=1.3,):
    """Answer ``message`` with the RetrievalQA chain and stream the reply.

    The chain is queried once with ``message`` only (single turn). The answer
    is followed by a list of the ``source`` metadata of the retrieved
    documents, and the combined text is yielded one character at a time so
    the UI shows a typing effect.

    Args:
        message: The user's question.
        chatbot: Chat history supplied by the UI; not used, because the
            RetrievalQA chain does not take history. (A
            ConversationalRetrievalChain would take it as ``chat_history``.)
        temperature, max_new_tokens, top_p, repetition_penalty: Accepted so
            the signature matches the optional sliders, but not forwarded to
            the chain; generation options are fixed when the LLM is built.

    Yields:
        Progressively longer prefixes of the full response text, line breaks
        included.
    """
    import time

    # RetrievalQA chain: history is not injected into the chain's built-in prompt.
    llm_response = qa_chain(message)
    res_result = llm_response['result']

    # Source path of every retrieved document, shown as the answer's evidence.
    res_relevant_doc = [source.metadata['source'] for source in llm_response["source_documents"]]
    response = f"{res_result}" + "\n\n" + "[답변 근거 소스 논문 (ctrl + click 하세요!)] :" + "\n" + f" \n {res_relevant_doc}"
    print("response: =====> \n", response, "\n\n")

    # Stream the text itself instead of splitting it on "\n" and re-joining the
    # pieces: that round trip dropped every line break from the displayed reply.
    for end in range(1, len(response) + 1):
        yield response[:end]
        time.sleep(0.01)  # pacing for the typing effect
# ํ์ดํ/์ค๋ช
/์ง๋ฌธ์์
title = "llama-2 ๋ชจ๋ธ ๊ด๋ จ ๋
ผ๋ฌธ QA ์๋น์ค"
description = """chat history ์ ์ง ๋ณด๋ค๋ QA์ ์ถฉ์คํ๋๋ก ์ ์๋์์ผ๋ Single turn์ผ๋ก ํ์ฉ์ ํ์ฌ ์ฃผ์ธ์. (chat history ํ์ฉ์ ๋ค๋ฅธ ์ฃผ์ ๋ก ๋ณ๋ ์ ์ ์์ )"""
css = """.toast-wrap { display: none !important } """
examples=[['Can you tell me about the llama-2 model?'],['What is percent accuracy, using the SPP layer as features on the SPP (ZF-5) model?'], ['What is percent accuracy, using the SPP layer as features on the SPP (ZF-5) model?'], ["tell me about method for human pose estimation based on DNNs"]]
# Like / dislike feedback
import gradio as gr
def vote(data: gr.LikeData):
    """Print to stdout whether a chatbot response was upvoted or downvoted.

    Args:
        data: Like event payload; ``data.liked`` selects the message and
            ``data.value`` is appended to it.
    """
    prefix = "You upvoted this response: " if data.liked else "You downvoted this response: "
    print(prefix + data.value)
# Gradio (tunable generation arguments)
# NOTE(review): the slider defaults for "Max new tokens" (256) and
# "Repetition penalty" (1.2) differ from predict()'s defaults (512, 1.3).
additional_inputs = [
    # gr.Textbox("", label="Optional system prompt"),
    gr.Slider(
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        value=0.9,
        label="Temperature",
        info="Higher values produce more diverse outputs",
        interactive=True,
    ),
    gr.Slider(
        minimum=0,
        maximum=4096,
        step=64,
        value=256,
        label="Max new tokens",
        info="The maximum numbers of new tokens",
        interactive=True,
    ),
    gr.Slider(
        minimum=0.0,
        maximum=1,
        step=0.05,
        value=0.6,
        label="Top-p (nucleus sampling)",
        info="Higher values sample more low-probability tokens",
        interactive=True,
    ),
    gr.Slider(
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        value=1.2,
        label="Repetition penalty",
        info="Penalize repeated tokens",
        interactive=True,
    ),
]

# Chat window with custom avatar images.
# Only the file ID goes after https://drive.google.com/uc?id= (the file must be
# shared so that every user can access it).
chatbot_stream = gr.Chatbot(
    bubble_full_width=False,
    avatar_images=(
        "https://drive.google.com/uc?id=13rYrN0cH_9tR7GveqO1q2JiyBCqkfCLZ",
        "https://drive.google.com/uc?id=1tfELAQW_VbPCy6QTRbexRlwAEYo8rSSv",
    ),
)

# Chat interface driven by predict(); the sliders above are currently not attached.
chat_interface_stream = gr.ChatInterface(
    predict,
    chatbot=chatbot_stream,
    title=title,
    description=description,
    examples=examples,
    css=css,
    # textbox=gr.Textbox(lines=5),
    # cache_examples=True,
    # additional_inputs=additional_inputs,
)
# Gradio demo: a single tab hosting the streaming chat interface.
with gr.Blocks() as demo:
    with gr.Tab("스트리밍"):  # tab label: "Streaming"
        # gr.ChatInterface(predict, title=title, description=description, css=css, examples=examples, cache_examples=True, additional_inputs=additional_inputs,)
        # Register the like/dislike handler on the chatbot, then render the
        # pre-built chat interface inside this tab.
        chatbot_stream.like(vote, None, None)
        chat_interface_stream.render()

# Enable request queuing, then start the server in debug mode.
demo.queue(concurrency_count=75, max_size=100).launch(debug=True)
|