izhan001's picture
Update app.py
781ba9f verified
raw
history blame
5.94 kB
import gradio as gr
import docx
import PyPDF2
from pptx import Presentation
from transformers import pipeline
from docx import Document
from io import BytesIO
import tempfile
# Hugging Face pipelines, built once at import time: one each for
# summarization, paraphrasing, and sentiment classification.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
rephraser = pipeline(
    task="text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    max_length=512,
    truncation=True,
)
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# Helper: load plain text from either a path or an already-open file object
def _read_text(file):
    """Return the text of *file*, decoding bytes as UTF-8.

    *file* may be a filesystem path or any object with a ``read()`` method
    that yields ``bytes`` or ``str``.
    """
    if hasattr(file, "read"):
        raw = file.read()
    else:
        # Not file-like, so treat it as a path and read the bytes ourselves.
        with open(file, "rb") as handle:
            raw = handle.read()
    if isinstance(raw, (bytes, bytearray)):
        return raw.decode("utf-8")
    return raw


# Function to read content from different file types
def read_file(file, file_type):
    """Extract the plain text of a document.

    Args:
        file: A filesystem path or a binary file-like object holding the
            document.
        file_type: One of ``"docx"``, ``"txt"``, ``"pdf"`` or ``"pptx"``.

    Returns:
        The extracted text, with a newline appended after every paragraph
        (docx), page (pdf) or text-bearing shape (pptx). An unrecognised
        ``file_type`` yields ``""``. If reading fails, no exception is
        raised; instead a message starting with
        ``"Error reading the file:"`` is returned.
    """
    try:
        if file_type == "docx":
            return "".join(para.text + "\n" for para in Document(file).paragraphs)
        if file_type == "txt":
            return _read_text(file)
        if file_type == "pdf":
            pages = PyPDF2.PdfReader(file).pages
            # Defensive: treat a page that yields no text (None) as empty
            # instead of letting one page turn the whole result into an error.
            return "".join((page.extract_text() or "") + "\n" for page in pages)
        if file_type == "pptx":
            return "".join(
                shape.text + "\n"
                for slide in Presentation(file).slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
    except Exception as e:
        # Callers expect a string back, so report the failure in-band.
        return f"Error reading the file: {str(e)}"
    return ""
# Function to process the file and generate outputs
def process_file(file, file_type, language="en"):
    """Read a document and run summarization, rephrasing and sentiment on it.

    Args:
        file: The uploaded document, passed straight to ``read_file``.
        file_type: Document type understood by ``read_file``.
        language: Accepted for interface compatibility; not used here.

    Returns:
        A 6-tuple ``(content, rephrased_text, summary_text, sentiment_text,
        keywords, processed_file_path)``. If the document cannot be read or
        is empty, the first item is an error message and the other five are
        ``None``. A failure in an individual model step is reported as an
        error string in that step's slot. ``processed_file_path`` is ``None``
        when the temporary file could not be written.
    """
    content = read_file(file, file_type)

    # read_file reports failures with this exact prefix. Matching the prefix,
    # not any occurrence of "Error", keeps documents that merely contain the
    # word from being rejected.
    if not content.strip() or content.startswith("Error reading the file:"):
        return "Error: The document is empty or unsupported format.", None, None, None, None, None

    # Summarize the content. truncation=True clips input that exceeds the
    # model's maximum length instead of failing on long documents.
    try:
        summary = summarizer(
            content, max_length=150, min_length=50, do_sample=False, truncation=True
        )
        summary_text = summary[0]['summary_text']
    except Exception as e:
        summary_text = f"Summary Error: {str(e)}"

    # Rephrase the entire content in fixed-size character chunks
    chunk_size = 500
    try:
        pieces = [
            rephraser(content[start:start + chunk_size])[0]['generated_text']
            for start in range(0, len(content), chunk_size)
        ]
        rephrased_text = " ".join(pieces)
    except Exception as e:
        rephrased_text = f"Rephrase Error: {str(e)}"

    # Sentiment analysis on the first 512 characters only
    try:
        sentiment_text = sentiment_analyzer(content[:512])[0]['label']
    except Exception as e:
        sentiment_text = f"Sentiment Analysis Error: {str(e)}"

    # Placeholder keyword extraction: the first ten whitespace-separated words
    keywords = ' '.join(content.split()[:10])

    # Write the extracted text to a temporary .txt file for download.
    # delete=False keeps the file on disk after the handle is closed.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
            temp_file.write(content.encode('utf-8'))
            processed_file_path = temp_file.name
    except (OSError, UnicodeError):
        # This slot feeds a file component, so return None, not an error
        # message that would be treated as a path.
        processed_file_path = None

    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path
# Define the functions for the different pages
def home_page():
    """Build the "Home" tab.

    The tab offers a file upload and shows the document's original and
    rephrased text once a file is chosen.

    Returns:
        The ``gr.Blocks`` object holding the page.
    """
    supported_types = ("pdf", "docx", "txt", "pptx")

    with gr.Blocks() as home:
        # Header
        gr.Markdown("## Upload a Document to Process")

        # Menu bar as buttons (no click handlers are attached to them)
        with gr.Row():
            gr.Button("Home")
            gr.Button("Full Analysis", variant="primary")

        # Display content on home page
        gr.Markdown("Welcome to the Document Processor!")
        gr.Markdown("Upload your document here and click to view details on the 'Full Analysis' page.")

        # File upload and content output
        file_input = gr.File(label="Upload Document")
        content_output = gr.Textbox(label="Original Content")
        rephrased_output = gr.Textbox(label="Rephrased Content")

        def on_file_upload(file):
            """Return (original text, rephrased text) for the uploaded file."""
            if not file:
                return "No file uploaded.", None

            # The upload may arrive as a path string or as an object with a
            # .name attribute; either way, take the type from the extension.
            path = file if isinstance(file, str) else getattr(file, "name", "")
            file_type = str(path).rsplit(".", 1)[-1].lower()
            if file_type not in supported_types:
                # Fall back to the type this page always assumed before.
                file_type = "docx"

            content, rephrased, *_ = process_file(file, file_type=file_type)
            return content, rephrased

        # Process file on upload
        file_input.change(on_file_upload, inputs=file_input, outputs=[content_output, rephrased_output])

    return home
def detailed_page():
    """Build the "Full Analysis" tab.

    The tab offers a file upload plus a file-type dropdown and shows the
    document's keywords, its sentiment label and a downloadable copy of the
    extracted text.

    Returns:
        The ``gr.Blocks`` object holding the page.
    """
    supported_types = ["pdf", "docx", "txt", "pptx"]

    with gr.Blocks() as detailed:
        # Header
        gr.Markdown("## Detailed Analysis Page")

        # Menu bar as buttons (no click handlers are attached to them)
        with gr.Row():
            gr.Button("Home", variant="primary")
            gr.Button("Full Analysis")

        # File upload and processing components
        file_input = gr.File(label="Upload Document")
        file_type = gr.Dropdown(supported_types, label="File Type")
        keywords_output = gr.Textbox(label="Keywords")
        sentiment_output = gr.Textbox(label="Sentiment Analysis")
        download_link = gr.File(label="Download Processed Document")

        def on_file_upload(file, file_type):
            """Return (keywords, sentiment, download path) for the upload.

            Exactly three values are returned on every path, one per output
            component wired up below.
            """
            if not file:
                return "No file uploaded.", None, None

            if not file_type:
                # No type chosen in the dropdown: infer it from the extension
                # when it is one of the supported types.
                path = file if isinstance(file, str) else getattr(file, "name", "")
                guessed = str(path).rsplit(".", 1)[-1].lower()
                if guessed in supported_types:
                    file_type = guessed

            content, _, _, sentiment, keywords, download_path = process_file(file, file_type)
            if keywords is None:
                # On failure process_file puts its message in the first slot
                # and None elsewhere; show it, don't leave the page blank.
                return content, None, None
            return keywords, sentiment, download_path

        # Process file on upload
        file_input.change(on_file_upload, inputs=[file_input, file_type], outputs=[keywords_output, sentiment_output, download_link])

        # Sample output or content for the detailed analysis page
        gr.Markdown("Here you will see detailed analysis outputs after document upload.")

    return detailed
# Combine both pages into a single tabbed application and start serving it.
iface = gr.TabbedInterface(
    interface_list=[home_page(), detailed_page()],
    tab_names=["Home", "Full Analysis"],
)
iface.launch()