Spaces:
Sleeping
Sleeping
Create retriever.py
Browse files- retriever.py +37 -0
retriever.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_core.runnables import RunnablePassthrough
|
2 |
+
from langchain_core.output_parsers import StrOutputParser
|
3 |
+
from langchain_community.chat_models import ChatOllama
|
4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
5 |
+
from langchain_pinecone import PineconeVectorStore
|
6 |
+
from langchain_community.embeddings import SentenceTransformerEmbeddings
|
7 |
+
|
8 |
+
import os
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from langchain.retrievers import BM25Retriever, EnsembleRetriever
|
11 |
+
from kiwipiepy import Kiwi
|
12 |
+
# Load settings from a local .env file (if one exists) into the process environment.
load_dotenv()
|
13 |
+
|
14 |
+
# Module-level Kiwi analyzer, created once at import time and reused by kiwi_tokenize().
kiwi = Kiwi()
|
15 |
+
|
16 |
+
def kiwi_tokenize(text):
    """Return the ``form`` of every token Kiwi produces for *text*.

    Passed to ``BM25Retriever.from_documents`` as ``preprocess_func`` so the
    BM25 index is built from Kiwi's tokens.
    """
    forms = []
    for tok in kiwi.tokenize(text):
        forms.append(tok.form)
    return forms
|
18 |
+
# embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True})
|
19 |
+
|
20 |
+
def retriever(pc, bm25):
    """Build a hybrid retriever that combines BM25 and vector-store search.

    Args:
        pc: Vector store exposing ``as_retriever``; presumably a
            ``PineconeVectorStore`` -- confirm against the caller.
        bm25: Documents handed to ``BM25Retriever.from_documents`` to build
            the keyword index (despite the name, this is not a retriever).

    Returns:
        An ``EnsembleRetriever`` that merges the BM25 results (weight 0.3)
        with the vector-store results (weight 0.7).
    """
    top_k = 4  # number of documents requested from each underlying retriever

    vector_retriever = pc.as_retriever(search_kwargs={"k": top_k})

    # Tokenize with Kiwi so the BM25 index is built from Kiwi's tokens.
    keyword_retriever = BM25Retriever.from_documents(
        bm25, preprocess_func=kiwi_tokenize
    )
    keyword_retriever.k = top_k

    # NOTE: a ``search_type="mmr"`` argument used to be passed here, but
    # EnsembleRetriever declares no such field, so it never enabled MMR.
    # It was removed to keep the code honest; the returned results are the
    # same. If MMR is wanted, configure it on the vector-store retriever
    # (``pc.as_retriever(search_type="mmr", ...)``) instead.
    return EnsembleRetriever(
        retrievers=[keyword_retriever, vector_retriever],  # retrievers to combine
        weights=[0.3, 0.7],  # weight applied to each retriever's results
    )
|