File size: 1,490 Bytes
d09dce6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import SentenceTransformerEmbeddings

import os
from dotenv import load_dotenv
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from kiwipiepy import Kiwi
# Load environment variables (presumably API keys / index names from a local
# .env file — confirm which variables the rest of the project expects).
load_dotenv()

# Single module-level Kiwi analyzer instance, shared by kiwi_tokenize below so
# the analyzer is constructed once at import time rather than per call.
kiwi = Kiwi()

def kiwi_tokenize(text):
    """Split *text* with the module-level Kiwi analyzer.

    Returns a list holding the ``form`` attribute of every token that
    ``kiwi.tokenize(text)`` yields, in order.
    """
    morphemes = kiwi.tokenize(text)
    return [morpheme.form for morpheme in morphemes]
# embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True}) 

def retriever(pc, bm25, *, k=4, weights=(0.3, 0.7), search_type="similarity"):
    """Build a hybrid retriever combining Kiwi-tokenized BM25 and a vector store.

    Args:
        pc: Vector store object exposing ``as_retriever`` (presumably a
            ``PineconeVectorStore`` — confirm against the caller).
        bm25: Documents handed to ``BM25Retriever.from_documents`` to build
            the keyword index.
        k: Number of documents each underlying retriever returns.
        weights: Ensemble weights in the order ``(bm25, vector store)``.
        search_type: Search type forwarded to ``pc.as_retriever``. The
            default ``"similarity"`` matches the previous behaviour; pass
            ``"mmr"`` to request MMR from the vector store.

    Returns:
        An ``EnsembleRetriever`` over the BM25 and vector-store retrievers.
    """
    vector_retriever = pc.as_retriever(
        search_type=search_type,
        search_kwargs={"k": k},
    )

    # Tokenize with Kiwi instead of the default preprocessing function.
    keyword_retriever = BM25Retriever.from_documents(
        bm25,
        preprocess_func=kiwi_tokenize,
    )
    keyword_retriever.k = k

    # NOTE: an earlier version passed search_type="mmr" to EnsembleRetriever.
    # That is not an option EnsembleRetriever defines, so it had no effect on
    # ranking; MMR has to be requested on the vector-store retriever (see the
    # `search_type` parameter above).
    return EnsembleRetriever(
        retrievers=[keyword_retriever, vector_retriever],  # retrievers to combine
        weights=list(weights),  # weight applied to each retriever's results
    )