Spaces:

Mohamed-BC
/

DocuBot

Sleeping

DocuBot / utilities.py

docubot-v1.5.3

20b1f3c 6 months ago

1.43 kB

	import base64
	import functools

	import pdfplumber
	from transformers import pipeline
	# Function to extract the text of every page of a PDF
	def get_pdf_text(pdf_file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_file: Passed unchanged to ``pdfplumber.open``.

	    Returns:
	        A single string made of each page's extracted text, joined with no
	        separator. Pages that yield no text contribute an empty string.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # extract_text() may yield None for a page with no extractable text
	        # (e.g. image-only pages on older pdfplumber releases); coercing to
	        # "" keeps such a page from raising TypeError during concatenation.
	        return "".join(page.extract_text() or "" for page in pdf.pages)

	def display_pdf(file_path):
	    """Build an HTML iframe snippet that embeds a PDF as a base64 data URI.

	    Args:
	        file_path: Path of the PDF file; it is opened in binary mode.

	    Returns:
	        An ``<iframe>`` tag (as a string) whose ``src`` is a
	        ``data:application/pdf;base64,...`` URI holding the file's bytes.
	    """
	    with open(file_path, "rb") as pdf_handle:
	        # Encode the raw bytes so they can be inlined into the src attribute.
	        encoded = base64.b64encode(pdf_handle.read()).decode("utf-8")
	    return f'<iframe src="data:application/pdf;base64,{encoded}" width="100%" height="600px"></iframe>'

	def split_text(text, max_length):
	    """Break text into pieces of at most ``max_length`` words each.

	    Words are the whitespace-separated tokens of ``text``; each piece is
	    its words re-joined with single spaces. Returns the list of pieces.
	    """
	    tokens = text.split()
	    pieces = []
	    for start in range(0, len(tokens), max_length):
	        window = tokens[start:start + max_length]
	        pieces.append(' '.join(window))
	    return pieces

	@functools.lru_cache(maxsize=1)
	def _get_summarizer():
	    """Build the BART summarization pipeline once and reuse it on later calls."""
	    return pipeline(task="summarization", model='facebook/bart-large-cnn')


	def summarize(text, max_length):
	    """Summarize ``text`` by summarizing word chunks and joining the results.

	    Args:
	        text: The text to summarize.
	        max_length: Maximum number of words per chunk handed to the model
	            (forwarded to ``split_text``).

	    Returns:
	        The per-chunk summaries joined with single spaces, or an empty
	        string when ``text`` contains no words.
	    """
	    text_chunks = split_text(text, max_length=max_length)
	    if not text_chunks:
	        # Nothing to summarize; skip loading the model entirely.
	        return ""
	    summarizer = _get_summarizer()
	    # Chunks are sized in words, not model tokens, so a chunk can still be
	    # longer than the model accepts; truncation=True clips it instead of
	    # letting the pipeline fail on an over-long input.
	    summaries = [
	        summarizer(chunk, truncation=True)[0]['summary_text']
	        for chunk in text_chunks
	    ]
	    return ' '.join(summaries)