# testbot_v4 / pdf_loader.py
import json
import os
import uuid
import datetime

from qdrant_client import QdrantClient, models
from langchain_core.load import dumpd, load
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import Qdrant


class PDFLoader:
    """Load PDFs with Azure AI Document Intelligence, split them into chunks,
    embed the chunks, and store the resulting vectors in Qdrant."""

    def __init__(self):
        pass

    def pdf_reader(self, path):
        """Read a PDF with the Azure AI Document Intelligence layout model and
        return one LangChain Document per page."""
        key = None       # replace with your Azure AI Document Intelligence key
        endpoint = None  # replace with your Azure AI Document Intelligence endpoint
        analysis_features = ["ocrHighResolution"]

        azure_pdf_loader = AzureAIDocumentIntelligenceLoader(
            api_endpoint=endpoint,
            api_key=key,
            file_path=path,
            api_model="prebuilt-layout",
            mode="page",
            analysis_features=analysis_features,
        )
        documents = azure_pdf_loader.load()
        return documents

    def save_raw_documents(self, path, name, documents):
        """Serialize the loaded documents to a JSON file so they can be reloaded
        later without re-running the Azure analysis."""
        log_file = {"documents": dumpd(documents)}
        log_file_name = os.path.join(path, name)
        with open(log_file_name, "w") as output_file:
            json.dump(log_file, output_file, indent=2)

    def load_raw_documents(self, path, name):
        """Reload documents previously written by save_raw_documents."""
        log_file_name = os.path.join(path, name)
        with open(log_file_name, "r") as input_file:
            log_file = json.load(input_file)
        documents = load(log_file["documents"])
        return documents

    def recursive_splitter(self, documents, chunk_size=1024, chunk_overlap=256):
        """Split documents into overlapping chunks with RecursiveCharacterTextSplitter."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=False,
        )
        chunks = splitter.split_documents(documents)
        return chunks

    def generate_vectors(self, chunks, embeddings, source_name):
        """Embed each chunk and collect its text, vector, and metadata.

        embed_documents is called with a single-element list, so every entry in
        `vectors` is itself a one-element list; callers index it with [0]
        (see save_manuals).
        """
        vectors = []
        metadatas = []
        page_contents = []
        for chunk in chunks:
            page_contents.append(chunk.page_content)
            vector = embeddings.embed_documents([chunk.page_content])
            vectors.append(vector)
            meta = chunk.metadata
            meta["source"] = source_name
            metadatas.append(meta)
        return page_contents, vectors, metadatas

    def save_to_database(self, chunks, embeddings, collection_name):
        """Embed the chunks and write them to a Qdrant collection."""
        qdrant = Qdrant.from_documents(
            chunks,
            embeddings,
            url=os.getenv("qdrant_url"),
            api_key=os.getenv("qdrant_api"),
            prefer_grpc=True,
            collection_name=collection_name,
        )
        return qdrant

    def load_from_database(self, embeddings, collection_name):
        """Open an existing Qdrant collection as a LangChain vector store."""
        db = Qdrant.from_existing_collection(
            embedding=embeddings,
            url=os.getenv("qdrant_url"),
            api_key=os.getenv("qdrant_api"),
            collection_name=collection_name,
        )
        return db

    def save_manuals(self, client, collection_name, car_id, model_year,
                     vectors, metadatas, page_contents):
        """Upsert pre-computed chunk vectors into Qdrant with manual-specific payload fields."""
        client.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=uuid.uuid4().hex,
                    payload={
                        "metadata": metadatas[idx],
                        "page_content": page_contents[idx],
                        "car_id": car_id,
                        "model_year": model_year,
                        "create_date": datetime.datetime.now().isoformat(),
                    },
                    # generate_vectors stores one single-element list per chunk,
                    # so the actual embedding is vector[0]
                    vector=vector[0],
                )
                for idx, vector in enumerate(vectors)
            ],
        )
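

# A minimal end-to-end usage sketch (not part of the original module). It assumes the
# Azure key/endpoint placeholders in pdf_reader have been filled in, that the
# qdrant_url / qdrant_api environment variables are set, that a Qdrant collection
# named "car_manuals" already exists with a matching vector size, and that an
# OpenAI-style embedding model is available; the file name "manual.pdf", the
# car_id/model_year values, and the embedding model name are illustrative only.
if __name__ == "__main__":
    from langchain_openai import OpenAIEmbeddings  # assumed embedding provider

    loader = PDFLoader()

    # 1. Read the PDF and cache the raw per-page documents locally.
    documents = loader.pdf_reader("manual.pdf")
    loader.save_raw_documents(".", "manual_raw.json", documents)

    # 2. Split the pages into overlapping chunks.
    chunks = loader.recursive_splitter(documents, chunk_size=1024, chunk_overlap=256)

    # 3. Embed the chunks and upsert them with manual-specific payload fields.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    page_contents, vectors, metadatas = loader.generate_vectors(
        chunks, embeddings, source_name="manual.pdf"
    )
    client = QdrantClient(url=os.getenv("qdrant_url"), api_key=os.getenv("qdrant_api"))
    loader.save_manuals(
        client,
        collection_name="car_manuals",  # assumed pre-existing collection
        car_id="example-car",
        model_year=2024,
        vectors=vectors,
        metadatas=metadatas,
        page_contents=page_contents,
    )

    # 4. Later, open the same collection as a LangChain vector store for retrieval.
    db = loader.load_from_database(embeddings, collection_name="car_manuals")
    print(db.similarity_search("How do I reset the tire pressure warning?", k=3))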