# PDF-Insight / app.py
# Rehan3024's picture
# Update app.py
# 0c41870 verified
import os
import tempfile

import fitz  # PyMuPDF
import streamlit as st
import torch  # Import torch module
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForQuestionAnswering
# Load the models
# Streamlit re-executes this script from the top on every widget interaction,
# so the checkpoints are loaded through a resource cache to avoid re-reading
# them on each rerun. On Streamlit versions that lack ``cache_resource`` the
# decorator degrades to a no-op, which matches the previous uncached behaviour.
_cache_models = getattr(st, "cache_resource", lambda func: func)


@_cache_models
def _load_summarizer(model_name):
    """Return the ``(tokenizer, seq2seq model)`` pair for *model_name*."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSeq2SeqLM.from_pretrained(model_name),
    )


@_cache_models
def _load_qa(model_name):
    """Return the ``(tokenizer, question-answering model)`` pair for *model_name*."""
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForQuestionAnswering.from_pretrained(model_name),
    )


summarization_model_name = 'facebook/bart-large-cnn'
tokenizer, summarization_model = _load_summarizer(summarization_model_name)

qa_model_name = 'deepset/bert-large-uncased-whole-word-masking-squad2'
qa_tokenizer, qa_model = _load_qa(qa_model_name)
# Function to extract text from a PDF file
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Value handed directly to ``fitz.open``; the app passes the
            filesystem path of the uploaded PDF.

    Returns:
        One string containing the output of ``page.get_text()`` for each
        page, joined without any separator.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(file) as doc:
        return "".join(page.get_text() for page in doc)
# Function to summarize document
def summarize_document(document, *, max_length=150, min_length=30):
    """Summarize *document* with the module-level summarization model.

    The tokenizer truncates the input to its first 1024 tokens, so only the
    beginning of a long document contributes to the summary.

    Args:
        document: Text to summarize.
        max_length: Upper bound passed to ``generate`` for the summary length.
        min_length: Lower bound passed to ``generate`` for the summary length.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        *document* is empty or contains only whitespace.
    """
    # Nothing to summarize (e.g. a PDF with no extractable text): do not ask
    # the model to generate from an empty sequence.
    if not document or not document.strip():
        return ""

    inputs = tokenizer(document, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = summarization_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Function to get answer to question
def get_answer(question, context):
    """Extract an answer to *question* from *context* with the QA model.

    The question/context pair is truncated to 512 tokens, so only the part of
    the context that fits in that single window is searched.

    Args:
        question: The user's question.
        context: Text in which to look for the answer.

    Returns:
        The decoded answer span with special tokens removed. Returns ``""``
        when either input is blank, when the predicted end precedes the
        predicted start, or when the span holds only special tokens (for
        example when the model points at ``[CLS]``, which SQuAD 2.0-style
        models use to signal "no answer").
    """
    if not question or not question.strip() or not context or not context.strip():
        return ""

    inputs = qa_tokenizer(question, context, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = qa_model(**inputs)

    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1  # slice end is exclusive
    if answer_end <= answer_start:
        return ""

    span_ids = inputs["input_ids"][0][answer_start:answer_end]
    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the answer.
    return qa_tokenizer.decode(span_ids, skip_special_tokens=True).strip()
# Streamlit app
st.title("PDF Summarizer and Q&A")
st.write("Upload a PDF file to get a summary and ask questions about the content.")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Save the upload to a uniquely named temporary file. A fixed name in the
    # working directory would be shared by every session, so two concurrent
    # uploads could overwrite each other's data.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_path = tmp.name

    try:
        # Extract text from the PDF
        document_text = extract_text_from_pdf(temp_path)
    finally:
        # Remove the temporary file even when extraction raises.
        os.remove(temp_path)

    # Display the extracted text
    st.write("Extracted Text:")
    st.write(document_text)

    if st.button("Summarize"):
        with st.spinner('Summarizing...'):
            summary = summarize_document(document_text)
        st.write("**Summary:**")
        st.write(summary)

    question = st.text_input("Ask a question about the document")
    if st.button("Get Answer"):
        if question:
            with st.spinner('Generating answer...'):
                answer = get_answer(question, document_text)
            st.write("**Answer:**")
            # Show an explicit message rather than a blank line when the
            # answer string is empty.
            st.write(answer if answer else "No answer found in the document.")
        else:
            st.write("Please enter a question.")