import base64
import pdfplumber
from transformers import pipeline
# Extract all text from a PDF file
def get_pdf_text(pdf_file):
    text = ""
    # Open the PDF file and extract text page by page
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()  # May be None for image-only pages
            if page_text:
                text += page_text + "\n"
    return text
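# Note: pdfplumber reads embedded text only; a scanned, image-only PDF will
# come back empty and would need an OCR step (e.g., pytesseract) instead.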
def display_pdf(file_path):
    # Read the PDF file as raw bytes
    with open(file_path, "rb") as f:
        data = f.read()
    # Encode the PDF content as base64 so it can be embedded inline
    base64_pdf = base64.b64encode(data).decode("utf-8")
    # Build an iframe that renders the PDF in the browser
    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600px"></iframe>'
    return pdf_display
def split_text(text, max_length):
    """Split text into chunks of at most max_length words."""
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks
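# Quick sanity check of the chunking above:
# split_text("a b c d e", 2) -> ['a b', 'c d', 'e']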
def summarize(text, max_length):
    # bart-large-cnn accepts roughly 1024 input tokens, so long documents
    # are chunked before summarization
    summarizer = pipeline(task="summarization", model='facebook/bart-large-cnn')
    text_chunks = split_text(text, max_length=max_length)  # Chunks of max_length words
    # Summarize each chunk independently
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in text_chunks]
    # Join the chunk summaries into a final summary
    final_summary = ' '.join(summaries)
    return final_summary
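
# --- Usage sketch (assumption) ---
# This file reads like the backend of a PDF viewer/summarizer app; the
# Streamlit wiring below is a minimal illustration, not the original app
# code. The file name "uploaded.pdf" and the 500-word chunk size are
# hypothetical choices.
import streamlit as st

uploaded = st.file_uploader("Upload a PDF", type="pdf")
if uploaded is not None:
    # Persist the upload so display_pdf() can re-open it by path
    with open("uploaded.pdf", "wb") as out:
        out.write(uploaded.getbuffer())
    # Render the PDF inline; the iframe HTML requires unsafe_allow_html=True
    st.markdown(display_pdf("uploaded.pdf"), unsafe_allow_html=True)
    if st.button("Summarize"):
        pdf_text = get_pdf_text("uploaded.pdf")
        st.write(summarize(pdf_text, max_length=500))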