"""Gradio demo: split an uploaded PDF into chunks and store them in LanceDB.

Two tabs are exposed:

* "Upload Pdfs" loads a PDF, splits it into text chunks and writes them to a
  LanceDB table whose vectors are produced by the ColBERT embedding function.
* "Get relevant chunks" runs a vector search over that table and shows the
  closest chunks.
"""

import gradio as gr
import lancedb
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Persistent storage is mounted to /data
# NOTE(review): DB_URL below is the *relative* path ./data, not the absolute
# /data mentioned above -- confirm which location is intended.
DB_URL = "./data/lancedb"
TABLE_NAME = "pdf_table"

# Number of chunks returned per query. The question tab declares exactly this
# many output boxes, so the search limit and the UI must stay in sync.
TOP_K = 3

# define schema for table with embedding api
model = get_registry().get("colbert").create(name="colbert-ir/colbertv2.0")


class TextModel(LanceModel):
    """Row schema: raw chunk text plus the vector derived from it."""

    # Source column fed to the embedding function.
    text: str = model.SourceField()
    # Vector column; its width is taken from the embedding model.
    vector: Vector(model.ndims()) = model.VectorField()


# add in vector db
def lanceDBConnection(df):
    """Create the chunk table from scratch and insert ``df`` into it.

    The table is created with ``mode="overwrite"``, i.e. every call starts
    from a fresh table rather than appending to a previous upload.

    Args:
        df: DataFrame with a ``text`` column holding one chunk per row.

    Returns:
        The LanceDB table the rows were added to.
    """
    db = lancedb.connect(DB_URL)
    table = db.create_table(
        TABLE_NAME,
        schema=TextModel,
        mode="overwrite",
    )
    table.add(df)
    return table


def _resolve_file_path(file):
    """Return the filesystem path behind whatever the upload widget passed in.

    Depending on the caller the value may be a plain path string, an object
    exposing the path as ``.name`` (e.g. a temporary-file wrapper), or a dict
    describing the upload.

    Raises:
        ValueError: if nothing was uploaded or no path can be determined.
    """
    if file is None:
        raise ValueError("no file was uploaded")
    if isinstance(file, dict):
        # Dicts have no ``.name`` attribute; the path lives under a key.
        # NOTE(review): "name"/"path" are the keys assumed here -- confirm
        # against the Gradio version in use.
        path = file.get("name") or file.get("path")
    else:
        # Objects carrying ``.name`` yield their path; plain paths pass through.
        path = getattr(file, "name", file)
    if not path:
        raise ValueError("could not determine the uploaded file path")
    return str(path)


def get_pdf(file):
    """Load an uploaded PDF, chunk it and store the chunks in LanceDB.

    Args:
        file: The value handed over by the ``gr.File`` input.

    Returns:
        A human-readable status string: either a success message with the
        number of stored chunks, or an ``An error occurred:...`` message.
    """
    try:
        # Access the file path
        file_path = _resolve_file_path(file)
        # Load the PDF using PyPDFLoader
        documents = PyPDFLoader(file_path).load()
    except Exception as e:
        # UI boundary: report the problem in the status box instead of raising.
        return f"An error occurred:{e}"

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
    docs_sp = text_splitter.split_documents(documents)
    texts = [chunk.page_content for chunk in docs_sp]
    if not texts:
        # Nothing to embed: bail out before the existing table is overwritten.
        return "An error occurred:no extractable text found in the PDF"

    df = pd.DataFrame({"text": texts})
    lanceDBConnection(df)
    return f"PDF uploaded successfully. \nTotal number of documents: {len(df)}"


def get_nearest_neighbours(query):
    """Return the text of the ``TOP_K`` chunks closest to ``query``.

    Args:
        query: The question typed by the user.

    Returns:
        A list of exactly ``TOP_K`` strings. When the search yields fewer
        hits, the list is padded with empty strings so that the number of
        returned values always matches the number of output boxes.
    """
    db = lancedb.connect(DB_URL)
    table = db.open_table(TABLE_NAME)
    result = table.search(query).limit(TOP_K).to_list()
    context = [r["text"] for r in result]
    # One value per declared output component, even for small tables.
    context += [""] * (TOP_K - len(context))
    return context


pdf_interface = gr.Interface(
    fn=get_pdf,
    inputs=[gr.File(label="Upload the PDF", file_types=[".pdf"])],
    outputs=[gr.Textbox(label="Status", lines=4)],
)

question_interface = gr.Interface(
    fn=get_nearest_neighbours,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=["text"] * TOP_K,
)

demo = gr.TabbedInterface(
    interface_list=[pdf_interface, question_interface],
    tab_names=["Upload Pdfs", "Get relevant chunks"],
    title="Save PDF chunks into LanceDB on persitent storage",
)

demo.launch(share=True)