import streamlit as st
import pandas as pd
import pdfplumber
import torch
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Number of chunks retrieved from the index for each question.
TOP_K = 3
# Token budget for the generated answer, counted separately from the prompt.
MAX_NEW_TOKENS = 100


# Load the Sentence Transformer model for embeddings
@st.cache_resource
def load_embedder():
    """Return the sentence-embedding model, cached by Streamlit across reruns."""
    return SentenceTransformer('all-MiniLM-L6-v2')


embedder = load_embedder()


# Load a generative model for answer generation
@st.cache_resource
def load_generator():
    """Return the GPT-2 text-generation pipeline, cached across reruns.

    The pipeline is placed on the first CUDA device when one is available
    and on the CPU otherwise.
    """
    return pipeline(
        'text-generation',
        model='gpt2',
        tokenizer='gpt2',
        device=0 if torch.cuda.is_available() else -1,
    )


generator = load_generator()


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, one page per line block.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (here, the object
            returned by ``st.file_uploader``).

    Returns:
        The text of each page followed by a newline, concatenated in page
        order. Pages for which no text is extracted are skipped, so the
        result is ``""`` when nothing could be extracted.
    """
    # Extraction has to happen while the PDF is still open.
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


# Function to split text into chunks
def split_text(text, chunk_size=500):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    The text is cut on ``". "`` and consecutive sentences are packed into a
    chunk for as long as they fit. A single piece longer than ``chunk_size``
    (for example CSV output, which contains no ``". "``) is cut into
    fixed-size windows so that no chunk exceeds the limit.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters; must be positive.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    pieces = text.split(". ")
    # str.split drops the separator: give the period back to every piece
    # except the last one, which was not followed by ". " in the input.
    sentences = [piece + "." for piece in pieces[:-1]]
    sentences.append(pieces[-1])

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > chunk_size:
            # The piece cannot fit in any chunk: flush what we have and cut
            # the piece into windows of chunk_size characters.
            if current_chunk:
                chunks.append(current_chunk)
            windows = [
                sentence[start:start + chunk_size]
                for start in range(0, len(sentence), chunk_size)
            ]
            chunks.extend(
                window.strip() for window in windows[:-1] if window.strip()
            )
            # The last window may be short; let later sentences join it.
            current_chunk = windows[-1].strip()
            continue

        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= chunk_size:
            current_chunk = candidate
        else:
            # current_chunk is non-empty here: an empty one would have made
            # candidate == sentence, which is known to fit.
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks


# Function to build FAISS index
def build_faiss_index(chunks):
    """Embed ``chunks`` and index the embeddings in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of strings to embed with the module-level
            ``embedder``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty (there would be no embedding
            dimension to size the index with).
    """
    if not chunks:
        raise ValueError("Cannot build an index from an empty list of chunks")

    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype='float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


def _retrieve_chunks(question, chunks, index, k=TOP_K):
    """Return up to ``k`` chunks whose embeddings are nearest to ``question``."""
    question_embedding = np.ascontiguousarray(
        embedder.encode([question]), dtype='float32'
    )
    # Never ask for more neighbours than there are chunks: FAISS pads the
    # missing results with -1, which would index the list from the end.
    k = min(k, len(chunks))
    _, indices = index.search(question_embedding, k)
    return [chunks[idx] for idx in indices[0] if idx >= 0]


def _generate_answer(context, question):
    """Generate an answer to ``question`` from ``context`` with the generator."""
    prompt = context + "\n\nQuestion: " + question + "\nAnswer:"
    response = generator(
        prompt,
        # Budget the answer on its own; max_length would also count the
        # prompt tokens and leave no room to generate.
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        # Return only the continuation so that an "Answer:" occurring inside
        # the retrieved context cannot be mistaken for the model's answer.
        return_full_text=False,
        # Per the Transformers pipeline docs, "hole" truncates the left of
        # an over-long prompt to leave room for the new tokens.
        handle_long_generation="hole",
    )
    generated = response[0]['generated_text']
    # Drop any follow-up question the model invents after its answer.
    return generated.split("\nQuestion:", 1)[0].strip()


# Streamlit app
st.title("PDF and CSV Chatbot with RAG")

# Upload CSV file
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
csv_text = ""
if csv_file:
    try:
        csv_data = pd.read_csv(csv_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # Report an unreadable upload instead of crashing the app.
        st.error(f"Could not read the CSV file: {exc}")
    else:
        st.write("### CSV Data:")
        st.write(csv_data)
        csv_text = csv_data.to_csv(index=False)

# Upload PDF file
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
pdf_text = ""
if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.strip():
        st.write("### PDF Text:")
        st.write(pdf_text)
    else:
        st.warning("No extractable text found in the PDF.")

# Combine texts
combined_text = csv_text + "\n" + pdf_text

if combined_text.strip():
    # Split text into chunks
    chunks = split_text(combined_text)

    # Build FAISS index
    index, embeddings = build_faiss_index(chunks)

    # Prepare for user input
    user_input = st.text_input("Ask a question about the uploaded data:")

    if st.button("Get Response"):
        if user_input.strip():
            # Retrieve the most relevant chunks and combine them
            retrieved_chunks = _retrieve_chunks(user_input, chunks, index)
            context = " ".join(retrieved_chunks)

            # Generate answer
            answer = _generate_answer(context, user_input)

            # Display response
            st.write("### Response:")
            if answer:
                st.write(answer)
            else:
                st.info("The model did not produce an answer.")
        else:
            st.warning("Please enter a question.")
else:
    st.info("Please upload a CSV file or a PDF file to proceed.")