# Spaces: Running on CPU Upgrade
import os

import gradio as gr
import lancedb
import pandas as pd
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Persistent storage is mounted at /data. Prefer that mount when it is present
# and writable; otherwise keep using the previous location, which is relative
# to the current working directory.
_PERSISTENT_ROOT = "/data"
DB_URL = (
    f"{_PERSISTENT_ROOT}/lancedb"
    if os.path.isdir(_PERSISTENT_ROOT) and os.access(_PERSISTENT_ROOT, os.W_OK)
    else "./data/lancedb"
)

# Name of the single table that holds the PDF chunks.
TABLE_NAME = "pdf_table"
# Embedding function looked up in the LanceDB embedding registry; the table
# schema below is declared in terms of it.
model = (
    get_registry()
    .get("colbert")
    .create(name="colbert-ir/colbertv2.0")
)


class TextModel(LanceModel):
    """Table schema: one text chunk plus the vector derived from it."""

    # Declared as the embedding function's source field.
    text: str = model.SourceField()
    # Vector column sized by the embedding function's dimensionality.
    vector: Vector(model.ndims()) = model.VectorField()
# Store chunks in the vector database.
def lanceDBConnection(df):
    """Create the chunk table from scratch and load ``df`` into it.

    The table is created in "overwrite" mode, so any table already stored
    under ``TABLE_NAME`` is replaced.

    Args:
        df: Rows to insert; passed unchanged to the table's ``add`` method.

    Returns:
        The table the rows were added to.
    """
    connection = lancedb.connect(DB_URL)
    chunk_table = connection.create_table(TABLE_NAME, schema=TextModel, mode="overwrite")
    chunk_table.add(df)
    return chunk_table
def get_pdf(file):
    """Split an uploaded PDF into text chunks and store them in LanceDB.

    Args:
        file: Upload value received from the file input. Accepted forms are a
            path (``str`` or path-like), a mapping with a ``"name"`` entry, or
            an object exposing a ``name`` attribute. ``None`` means nothing
            was uploaded.

    Returns:
        A status message: the success line with the number of stored chunks,
        or a description of why nothing was stored.
    """
    if file is None:
        return "No file was uploaded. Please select a PDF first."

    try:
        # Work out the on-disk path for each shape the upload value can take.
        if isinstance(file, dict):
            file_path = file["name"]
        elif isinstance(file, str) or hasattr(file, "__fspath__"):
            # Already a path; for path-like objects ``.name`` would only be
            # the final component, so it must not be used here.
            file_path = file
        else:
            file_path = getattr(file, "name", file)
        # Load the PDF using PyPDFLoader
        documents = PyPDFLoader(file_path).load()
    except Exception as e:
        # Report the failure as the status text instead of raising.
        return f"An error occurred:{e}"

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
    texts = [chunk.page_content for chunk in text_splitter.split_documents(documents)]
    if not texts:
        # lanceDBConnection recreates the table in overwrite mode; skip it so
        # an empty PDF does not wipe previously stored chunks.
        return "No extractable text was found in the PDF; nothing was stored."

    df = pd.DataFrame({"text": texts})
    lanceDBConnection(df)
    return f"PDF uploaded successfully. Total number of documents: {len(df)}"
def get_nearest_neighbours(query, *, limit=3):
    """Return the text of the stored chunks that best match ``query``.

    Args:
        query: Search input handed to the table's ``search`` method.
        limit: Number of entries to return. The default of 3 matches the
            three text outputs declared for the question interface.

    Returns:
        A list of ``limit`` strings. When the search yields fewer rows, the
        remaining entries are empty strings so the length stays fixed.
    """
    table = lancedb.connect(DB_URL).open_table(TABLE_NAME)
    rows = table.search(query).limit(limit).to_list()
    context = [row["text"] for row in rows]
    # The interface expects one value per output component; pad short results
    # (e.g. a table holding fewer than ``limit`` chunks).
    context.extend([""] * (limit - len(context)))
    return context
# Tab 1: upload a PDF and show the status message returned by get_pdf.
pdf_interface = gr.Interface(
    fn=get_pdf,
    inputs=[gr.File(label="Upload the PDF", file_types=[".pdf"])],
    outputs=[gr.Textbox(label="Status", lines=4)],
)

# Tab 2: enter a question and show the returned chunks in three text outputs.
question_interface = gr.Interface(
    fn=get_nearest_neighbours,
    inputs=[gr.Textbox(label="Enter your question")],
    outputs=["text"] * 3,
)

demo = gr.TabbedInterface(
    interface_list=[pdf_interface, question_interface],
    tab_names=["Upload Pdfs", "Get relevant chunks"],
    title="Save PDF chunks into LanceDB on persistent storage",
)

# Only start the server when run as a script, so importing this module
# (for tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch(share=True)