Spaces:
Sleeping
Sleeping
File size: 3,809 Bytes
39b06ad eaa707f 39b06ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
from langchain_community.document_loaders import PyMuPDFLoader
def load_pdfs(paths: list) -> list:
    """Load every PDF in ``paths`` and return the results as one flat list.

    Args:
        paths: File paths of the PDFs to load, processed in the given order.

    Returns:
        The items produced by ``PyMuPDFLoader(path).load()`` for each path,
        concatenated in input order. An empty ``paths`` yields an empty list.
    """
    documents = []
    for path in paths:
        # load() returns a collection per PDF, so extend instead of append
        # to keep the result flat.
        documents.extend(PyMuPDFLoader(path).load())
    return documents
#####
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_docs_recursive(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``documents`` into chunks using ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
#####
from langchain.text_splitter import NLTKTextSplitter
import nltk
# Download the 'punkt_tab' NLTK resource as a side effect of importing this module.
# NOTE(review): presumably needed by NLTKTextSplitter's sentence tokenization — confirm.
nltk.download('punkt_tab')
def chunk_docs_nltk(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    """Split ``documents`` into chunks using ``NLTKTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``documents``.
    """
    nltk_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    pieces = nltk_splitter.split_documents(documents)
    return pieces
#####
# from langchain_openai import OpenAIEmbeddings
# def create_embeddings_openai(model: str) -> OpenAIEmbeddings:
# # Initialize the OpenAIEmbeddings class
# embeddings = OpenAIEmbeddings(model=model)
# return embeddings
#####
from langchain_huggingface import HuggingFaceEmbeddings
def create_embeddings_opensource(model: str) -> HuggingFaceEmbeddings:
    """Create a ``HuggingFaceEmbeddings`` instance for the given model.

    Args:
        model: Passed to ``HuggingFaceEmbeddings`` as ``model_name``.

    Returns:
        The newly constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model)
#####
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
def create_vector_store(location: str, collection_name: str, vector_size: int, embeddings, documents: list) -> QdrantVectorStore:
    """Create a Qdrant collection, wrap it in a vector store, and add documents.

    Args:
        location: Passed to ``QdrantClient`` as ``location``.
        collection_name: Name of the collection to create and then use.
        vector_size: Vector size configured for the new collection.
            NOTE(review): presumably must equal the output dimension of
            ``embeddings`` — confirm against the embedding model in use.
        embeddings: Embedding object handed to ``QdrantVectorStore``.
        documents: Documents added to the store after it is created.

    Returns:
        The ``QdrantVectorStore`` after ``add_documents(documents)`` has run.
    """
    client = QdrantClient(location=location)

    # The collection is configured for cosine distance.
    vector_params = VectorParams(size=vector_size, distance=Distance.COSINE)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_params,
    )

    store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    store.add_documents(documents)
    return store
#####
def create_retriever_from_qdrant(vector_store: QdrantVectorStore):
    """Return the retriever produced by ``vector_store.as_retriever()``.

    Args:
        vector_store: The Qdrant vector store to create a retriever from.

    Returns:
        The result of calling ``as_retriever()`` with no arguments.
    """
    return vector_store.as_retriever()
#####
from langchain.prompts import ChatPromptTemplate
def create_chat_prompt_template() -> ChatPromptTemplate:
    """Build the chat prompt template used for context-only answering.

    The template has two placeholders, ``{question}`` and ``{context}``, and
    tells the model to reply "I don't know" when the context lacks the answer.

    Returns:
        A ``ChatPromptTemplate`` created from the template text.
    """
    # The template text is kept flush-left so no extra whitespace is added.
    prompt_text = """
Only answer the question using the context below. If the answer can't be found in the context, respond "I don't know".
Question:
{question}
Context:
{context}
"""
    return ChatPromptTemplate.from_template(prompt_text)
#####
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from operator import itemgetter
def create_chain_openai(model: str, prompt: ChatPromptTemplate, retriever):
    """Build a retrieval chain that answers a question with an OpenAI chat model.

    The chain takes a mapping with a ``"question"`` key. It passes the question
    through ``retriever`` to get the context, then runs ``prompt | llm``. The
    output is a mapping with ``"response"`` (the LLM output) and ``"context"``
    (the retriever output).

    Args:
        model: Name of the OpenAI chat model to use. A falsy value falls back
            to ``"gpt-4o-mini"``, the model this function previously always used.
        prompt: Prompt template piped into the LLM; it receives the
            ``"question"`` and ``"context"`` keys.
        retriever: Runnable that receives the question and returns the context.

    Returns:
        The composed runnable chain.
    """
    llm = ChatOpenAI(
        # Use the caller's model. It used to be hard-coded to "gpt-4o-mini",
        # which silently ignored the ``model`` argument.
        model_name=model or "gpt-4o-mini",
        temperature=0,
    )
    chain = (
        # Step 1: retrieve context for the question and keep the question.
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Step 2: re-assigns "context" to itself; kept from the original pipeline.
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # Step 3: produce the LLM response and return the context alongside it.
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return chain
##### |