"""PDF ingestion utilities.

Load PDFs via Azure AI Document Intelligence, cache the raw documents as
JSON, split them into chunks, embed them, and store the vectors in Qdrant.
"""
import datetime
import json
import os
import uuid

from langchain.text_splitter import NLTKTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_core.load import dumpd, dumps, load, loads
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models
class PDFLoader:
    """Pipeline helper for PDF ingestion.

    Reads PDFs with Azure AI Document Intelligence, caches the raw
    documents as JSON, splits them into chunks, embeds the chunks, and
    persists the resulting vectors in a Qdrant collection.
    """

    def __init__(self):
        # Stateless: every method receives its dependencies explicitly.
        pass

    def pdf_reader(self, path):
        """Load the PDF at *path* with Azure AI Document Intelligence.

        Uses the ``prebuilt-layout`` model in page mode, so one langchain
        Document is returned per page.

        Args:
            path: Filesystem path of the PDF to analyze.

        Returns:
            List of langchain ``Document`` objects (one per page).
        """
        key = None       # TODO: replace with your Document Intelligence API key
        endpoint = None  # TODO: replace with your Document Intelligence endpoint
        # High-resolution OCR improves extraction from scanned pages.
        analysis_features = ["ocrHighResolution"]
        azure_pdf_loader = AzureAIDocumentIntelligenceLoader(
            api_endpoint=endpoint,
            api_key=key,
            file_path=path,
            api_model="prebuilt-layout",
            mode="page",
            analysis_features=analysis_features,
        )
        return azure_pdf_loader.load()

    def save_raw_documents(self, path, name, documents):
        """Serialize *documents* to ``path/name`` as a JSON cache file.

        The documents are converted with langchain's ``dumpd`` so that
        :meth:`load_raw_documents` can rebuild them later.
        """
        log_file = {"documents": dumpd(documents)}
        log_file_name = os.path.join(path, name)
        with open(log_file_name, "w", encoding="utf-8") as output_file:
            # json.dump writes directly to the handle (the original
            # printed a json.dumps string, which also appended a newline).
            json.dump(log_file, output_file, indent=2, ensure_ascii=False)

    def load_raw_documents(self, path, name):
        """Rebuild Documents from a JSON cache written by save_raw_documents.

        Returns:
            The deserialized langchain ``Document`` objects.
        """
        log_file_name = os.path.join(path, name)
        # Text mode + utf-8 matches how save_raw_documents writes the file.
        with open(log_file_name, "r", encoding="utf-8") as input_file:
            log_file = json.load(input_file)
        # langchain's ``load`` reconstructs the original Document objects.
        return load(log_file["documents"])

    def recursive_splitter(self, documents, chunk_size=1024, chunk_overlap=256):
        """Split *documents* into overlapping character-based chunks.

        Args:
            documents: Iterable of langchain Documents.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters shared between consecutive chunks.

        Returns:
            List of chunked ``Document`` objects.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            add_start_index=False,
        )
        return splitter.split_documents(documents)

    def generate_vectors(self, chunks, embeddings, source_name):
        """Embed chunk texts and tag each chunk's metadata with *source_name*.

        Args:
            chunks: Chunked Documents (must expose ``page_content`` and
                ``metadata``).
            embeddings: Object with an ``embed_documents(texts)`` method.
            source_name: Value stored under the ``"source"`` metadata key.

        Returns:
            Tuple ``(page_contents, vectors, metadatas)``. ``vectors[i]`` is
            a one-element list wrapping chunk i's embedding — this nesting is
            kept for backward compatibility with :meth:`save_manuals`, which
            unwraps it via ``vector[0]``.
        """
        page_contents = [chunk.page_content for chunk in chunks]
        # One batched embedding call instead of one API call per chunk;
        # guard the empty case since some backends reject empty batches.
        embedded = embeddings.embed_documents(page_contents) if page_contents else []
        vectors = [[vector] for vector in embedded]
        metadatas = []
        for chunk in chunks:
            # Copy before annotating: the original mutated the caller's
            # chunk.metadata dict in place.
            meta = dict(chunk.metadata)
            meta["source"] = source_name
            metadatas.append(meta)
        return page_contents, vectors, metadatas

    def save_to_database(self, chunks, embeddings, collection_name):
        """Embed *chunks* and upsert them into a Qdrant collection.

        Connection settings come from the ``qdrant_url`` / ``qdrant_api``
        environment variables.
        """
        Qdrant.from_documents(
            chunks,
            embeddings,
            url=os.getenv('qdrant_url'),
            api_key=os.getenv('qdrant_api'),
            prefer_grpc=True,
            collection_name=collection_name,
        )

    def load_from_database(self, embeddings, collection_name):
        """Open an existing Qdrant collection as a langchain vector store.

        Returns:
            A ``Qdrant`` vector-store instance bound to *collection_name*.
        """
        return Qdrant.from_existing_collection(
            embedding=embeddings,
            url=os.getenv('qdrant_url'),
            api_key=os.getenv('qdrant_api'),
            collection_name=collection_name,
        )

    def save_manuals(self, client, collection_name, car_id, model_year,
                     vectors, metadatas, page_contents):
        """Upsert manual chunks into Qdrant with car/model payload fields.

        Args:
            client: A ``QdrantClient`` instance.
            collection_name: Target collection.
            car_id: Identifier stored in each point's payload.
            model_year: Model year stored in each point's payload.
            vectors: Per-chunk embeddings as produced by
                :meth:`generate_vectors` (``vectors[i] == [embedding]``).
            metadatas: Per-chunk metadata dicts, parallel to *vectors*.
            page_contents: Per-chunk text, parallel to *vectors*.
        """
        client.upsert(
            collection_name=collection_name,
            points=[
                models.PointStruct(
                    id=uuid.uuid4().hex,
                    payload={
                        "metadata": metadatas[idx],
                        "page_content": page_contents[idx],
                        "car_id": car_id,
                        "model_year": model_year,
                        # Timezone-aware UTC timestamp (was naive local time).
                        "create_date": datetime.datetime.now(
                            datetime.timezone.utc
                        ).isoformat(),
                    },
                    vector=vector[0],  # unwrap the [embedding] nesting
                )
                for idx, vector in enumerate(vectors)
            ],
        )