import datetime
import json
import os
import uuid

from langchain.text_splitter import NLTKTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_core.load import dumpd, dumps, load, loads
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models


class PDFLoader:
    """Load PDFs, split them into chunks, embed the chunks and store them in Qdrant."""

    def __init__(self):
        """Create a loader; the class keeps no instance state."""
        pass

    def pdf_reader(self, path, key=None, endpoint=None):
        """Load a PDF through AzureAIDocumentIntelligenceLoader.

        Args:
            path: Path of the PDF file, passed to the loader as ``file_path``.
            key: API key for the loader. Defaults to ``None``, which is the
                value the method previously hard-coded.
            endpoint: API endpoint for the loader. Defaults to ``None`` for
                the same reason.

        Returns:
            Whatever ``AzureAIDocumentIntelligenceLoader.load()`` returns
            (presumably one document per page, given ``mode="page"``).
        """
        analysis_features = ["ocrHighResolution"]

        azure_loader = AzureAIDocumentIntelligenceLoader(
            api_endpoint=endpoint,
            api_key=key,
            file_path=path,
            api_model="prebuilt-layout",
            mode="page",
            analysis_features=analysis_features,
        )
        return azure_loader.load()

    def save_raw_documents(self, path, name, documents):
        """Serialize ``documents`` with ``dumpd`` and write them as indented JSON.

        Args:
            path: Directory that receives the file.
            name: File name inside ``path``.
            documents: Objects to serialize; stored under the "documents" key.
        """
        log_file = {"documents": dumpd(documents)}
        log_file_name = os.path.join(path, name)
        # json.dumps escapes non-ASCII by default, so the explicit encoding
        # only pins down what was already being written.
        with open(log_file_name, 'w', encoding="utf-8") as output_file:
            print(json.dumps(log_file, indent=2), file=output_file)

    def load_raw_documents(self, path, name):
        """Read a file written by ``save_raw_documents`` and rebuild the documents.

        Args:
            path: Directory containing the file.
            name: File name inside ``path``.

        Returns:
            The result of ``load`` applied to the stored "documents" entry.
        """
        log_file_name = os.path.join(path, name)
        with open(log_file_name, 'rb') as input_file:
            log_file = json.load(input_file)
        # NOTE(review): load() rebuilds objects described by the file content,
        # so this should presumably only be used on trusted files - confirm
        # against the langchain_core.load documentation.
        return load(log_file["documents"])

    def recursive_splitter(self, documents, chunk_size=1024, chunk_overlap=256):
        """Split ``documents`` with a RecursiveCharacterTextSplitter.

        Args:
            documents: Documents to split.
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

        Returns:
            The chunks produced by ``split_documents``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=False,
        )
        return splitter.split_documents(documents)

    def generate_vectors(self, chunks, embeddings, source_name):
        """Embed every chunk and collect its text, embedding result and metadata.

        Side effect: each chunk's ``metadata`` dict is modified in place, its
        "source" entry being overwritten with ``source_name``.

        Args:
            chunks: Objects exposing ``page_content`` and ``metadata``.
            embeddings: Object exposing ``embed_documents``.
            source_name: Value stored under the "source" metadata key.

        Returns:
            A ``(page_contents, vectors, metadatas)`` tuple of parallel lists.
            Each entry of ``vectors`` is the raw result of calling
            ``embed_documents`` with a single text; ``save_manuals`` reads the
            embedding from index 0 of that result.
        """
        vectors = []
        metadatas = []
        page_contents = []

        for chunk in chunks:
            page_contents.append(chunk.page_content)

            # One call per chunk keeps the per-entry shape that save_manuals
            # depends on (it indexes each entry with [0]).
            vectors.append(embeddings.embed_documents([chunk.page_content]))

            meta = chunk.metadata
            meta["source"] = source_name
            metadatas.append(meta)

        return page_contents, vectors, metadatas

    def save_to_database(self, chunks, embeddings, collection_name):
        """Store ``chunks`` in a Qdrant collection via ``Qdrant.from_documents``.

        The server URL and API key are read from the ``qdrant_url`` and
        ``qdrant_api`` environment variables.

        Args:
            chunks: Documents to store.
            embeddings: Embedding object handed to ``Qdrant.from_documents``.
            collection_name: Name of the target collection.

        Returns:
            The object returned by ``Qdrant.from_documents``. Earlier versions
            discarded it and returned ``None``.
        """
        return Qdrant.from_documents(
            chunks,
            embeddings,
            url=os.getenv('qdrant_url'),
            api_key=os.getenv('qdrant_api'),
            prefer_grpc=True,
            collection_name=collection_name,
        )

    def load_from_database(self, embeddings, collection_name):
        """Open an existing Qdrant collection.

        The server URL and API key are read from the ``qdrant_url`` and
        ``qdrant_api`` environment variables.

        Args:
            embeddings: Embedding object passed as ``embedding``.
            collection_name: Name of the collection to open.

        Returns:
            The object returned by ``Qdrant.from_existing_collection``.
        """
        return Qdrant.from_existing_collection(
            embedding=embeddings,
            url=os.getenv('qdrant_url'),
            api_key=os.getenv('qdrant_api'),
            collection_name=collection_name,
        )

    def save_manuals(self, client, collection_name, car_id, model_year, vectors, metadatas, page_contents):
        """Upsert one point per entry of ``vectors`` into ``collection_name``.

        Args:
            client: Object exposing ``upsert(collection_name=..., points=...)``.
            collection_name: Name of the target collection.
            car_id: Stored in every payload under "car_id".
            model_year: Stored in every payload under "model_year".
            vectors: Sequence whose items hold the embedding at index 0, as
                produced by ``generate_vectors``.
            metadatas: Sequence indexed in parallel with ``vectors``.
            page_contents: Sequence indexed in parallel with ``vectors``.

        Raises:
            IndexError: If ``metadatas`` or ``page_contents`` is a list
                shorter than ``vectors``.
        """
        # Taken once so every point written by this call carries the same
        # creation timestamp instead of drifting by a few microseconds.
        create_date = datetime.datetime.now().isoformat()

        points = [
            models.PointStruct(
                id=uuid.uuid4().hex,
                payload={
                    "metadata": metadatas[idx],
                    "page_content": page_contents[idx],
                    "car_id": car_id,
                    "model_year": model_year,
                    "create_date": create_date,
                },
                vector=vector[0],
            )
            for idx, vector in enumerate(vectors)
        ]

        client.upsert(
            collection_name=collection_name,
            points=points,
        )