DocuBot / utilities.py
Mohamed-BC's picture
docubot-v1.5.3
20b1f3c
import base64
import pdfplumber
from transformers import pipeline
# Extract the text of a PDF as one plain string (no summarization happens here).
def get_pdf_text(pdf_file):
    """Return the text of every page of *pdf_file*, in page order.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        One string containing each page's extracted text, with a newline
        between consecutive pages. A page that yields no text contributes
        an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # `or ""` guards against extract_text() yielding None instead of a
        # string (reported for pages without a text layer in some pdfplumber
        # releases -- TODO confirm for the pinned version).
        # Joining with "\n" keeps the last word of one page from fusing with
        # the first word of the next, which would corrupt word-based chunking.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
def display_pdf(file_path):
    """Build an HTML ``<iframe>`` snippet that embeds the PDF at *file_path*.

    The file is read as raw bytes, base64-encoded, and inlined into the
    iframe's ``src`` as a ``data:application/pdf`` URI.

    Args:
        file_path: Path of the PDF file to embed.

    Returns:
        The iframe markup as a string.
    """
    with open(file_path, "rb") as pdf_handle:
        encoded = base64.b64encode(pdf_handle.read()).decode("utf-8")
    return f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600px"></iframe>'
def split_text(text, max_length):
    """Split *text* into chunks of at most *max_length* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace is a
    separator and each chunk is re-joined with single spaces.

    Args:
        text: The string to split.
        max_length: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        text gives an empty list.

    Raises:
        ValueError: If *max_length* is less than 1.
    """
    # Without this guard, 0 fails inside range() with an unrelated-looking
    # message and a negative value silently returns no chunks at all.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]
# Shared summarization pipeline, created on first use so the model is loaded
# once per process rather than on every summarize() call.
_summarizer_pipeline = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _summarizer_pipeline
    if _summarizer_pipeline is None:
        _summarizer_pipeline = pipeline(task="summarization", model='facebook/bart-large-cnn')
    return _summarizer_pipeline


def summarize(text, max_length):
    """Summarize *text* chunk by chunk and return the combined summary.

    The text is cut into chunks of at most *max_length* words with
    ``split_text``; each chunk is summarized on its own and the results are
    joined with single spaces.

    Args:
        text: The text to summarize.
        max_length: Maximum number of words per chunk handed to the model.

    Returns:
        The joined chunk summaries, or an empty string when *text* is empty
        or contains only whitespace.
    """
    # Nothing to summarize: return before the model is ever loaded.
    if not text or not text.strip():
        return ""
    text_chunks = split_text(text, max_length=max_length)
    summarizer = _get_summarizer()
    # truncation=True clips a chunk that exceeds the model's input limit
    # instead of letting the call fail inside the model.
    summaries = [
        summarizer(chunk, truncation=True)[0]['summary_text']
        for chunk in text_chunks
    ]
    return ' '.join(summaries)