# aie4-midterm-task2 / utils.py
from langchain_community.document_loaders import PyMuPDFLoader
def load_pdfs(paths: list) -> list:
    # Create a list to store the loaded documents
    documents = []
    # Loop through each PDF path and load its pages
    for path in paths:
        loader = PyMuPDFLoader(path)
        documents.extend(loader.load())  # Add the loaded pages to the list
    return documents
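# Example usage (file names are illustrative):
# docs = load_pdfs(["report.pdf", "slides.pdf"])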
#####
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_docs_recursive(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    return chunks
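# Example usage (chunk sizes are illustrative starting points, in characters):
# chunks = chunk_docs_recursive(docs, chunk_size=1000, chunk_overlap=200)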
#####
from langchain.text_splitter import NLTKTextSplitter
def chunk_docs_nltk(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    # Note: NLTKTextSplitter splits on sentence boundaries and requires the
    # NLTK "punkt" tokenizer data (install with nltk.download("punkt"))
    text_splitter = NLTKTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    return chunks
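# Example usage (assumes the NLTK punkt data is installed; sizes are illustrative):
# chunks = chunk_docs_nltk(docs, chunk_size=1000, chunk_overlap=200)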
#####
# from langchain_openai import OpenAIEmbeddings
# def create_embeddings_openai(model: str) -> OpenAIEmbeddings:
#     # Initialize the OpenAIEmbeddings class
#     embeddings = OpenAIEmbeddings(model=model)
#     return embeddings
#####
from langchain_huggingface import HuggingFaceEmbeddings
def create_embeddings_opensource(model: str) -> HuggingFaceEmbeddings:
    # Initialize the HuggingFaceEmbeddings class
    embeddings = HuggingFaceEmbeddings(model_name=model)
    return embeddings
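# Example usage (model name is illustrative; any sentence-transformers model works,
# but the vector_size passed to create_vector_store below must match its dimension):
# embeddings = create_embeddings_opensource("sentence-transformers/all-MiniLM-L6-v2")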
#####
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
def create_vector_store(location: str, collection_name: str, vector_size: int, embeddings, documents: list) -> QdrantVectorStore:
    # Initialize the Qdrant client
    qdrant_client = QdrantClient(
        location=location
    )
    # Create a collection in Qdrant (vector_size must match the embedding model's dimension)
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance=Distance.COSINE
        )
    )
    # Initialize QdrantVectorStore with the Qdrant client
    qdrant_vector_store = QdrantVectorStore(
        client=qdrant_client,
        collection_name=collection_name,
        embedding=embeddings,
    )
    # Embed the documents and add them to the collection
    qdrant_vector_store.add_documents(documents)
    return qdrant_vector_store
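# Example usage (a sketch: ":memory:" runs Qdrant in-process; 384 assumes the
# all-MiniLM-L6-v2 embedding dimension from the example above):
# vector_store = create_vector_store(":memory:", "pdf_chunks", 384, embeddings, chunks)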
#####
def create_retriever_from_qdrant(vector_store: QdrantVectorStore):
    # Wrap the vector store in a retriever with default search settings
    retriever = vector_store.as_retriever()
    return retriever
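# Example usage (query is illustrative):
# retriever = create_retriever_from_qdrant(vector_store)
# relevant_docs = retriever.invoke("What topics do the PDFs cover?")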
#####
from langchain.prompts import ChatPromptTemplate
def create_chat_prompt_template() -> ChatPromptTemplate:
    template = """
    Only answer the question using the context below. If the answer can't be found in the context, respond "I don't know".

    Question:
    {question}

    Context:
    {context}
    """
    prompt = ChatPromptTemplate.from_template(template)
    return prompt
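# Example usage (values are illustrative; the chain below fills them in automatically):
# prompt = create_chat_prompt_template()
# messages = prompt.format_messages(
#     question="What is RAG?",
#     context="Retrieval-augmented generation (RAG) combines retrieval with generation."
# )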
#####
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from operator import itemgetter
def create_chain_openai(model: str, prompt: ChatPromptTemplate, retriever):
    llm = ChatOpenAI(
        model=model,
        temperature=0
    )
    chain = (
        # Retrieve context for the question, passing the question through unchanged
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        # Keep the retrieved context available alongside the generation step
        | RunnablePassthrough.assign(context=itemgetter("context"))
        # Generate the response and return it together with the retrieved context
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return chain
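# Example end-to-end usage (assumes OPENAI_API_KEY is set; the question is illustrative):
# chain = create_chain_openai("gpt-4o-mini", prompt, retriever)
# result = chain.invoke({"question": "What do the PDFs conclude?"})
# print(result["response"].content)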
#####