# Document Processor — Gradio app: upload a document and get a summary,
# a rephrased version, sentiment, and keywords.
# (Replaced Hugging Face Spaces page residue "Spaces: / Sleeping / Sleeping".)
import gradio as gr | |
import docx | |
import PyPDF2 | |
from pptx import Presentation | |
from transformers import pipeline | |
from docx import Document | |
from io import BytesIO | |
import tempfile | |
# Hugging Face pipelines, loaded once at module import:
#   summarizer         — abstractive summarization (BART-large fine-tuned on CNN)
#   rephraser          — T5 paraphraser, output capped at 512 tokens with truncation
#   sentiment_analyzer — binary sentiment (DistilBERT fine-tuned on SST-2)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
rephraser = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    max_length=512,
    truncation=True,
)
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# Function to read content from different file types
def read_file(file, file_type):
    """Extract plain text from an uploaded document.

    Parameters
    ----------
    file : file-like object (or path) accepted by the underlying parser.
    file_type : one of "docx", "txt", "pdf", "pptx"; any other value
        yields an empty string.

    Returns
    -------
    str
        The extracted text, or a message starting with
        "Error reading the file:" when parsing fails (callers detect
        failure via that prefix).
    """
    parts = []
    try:
        if file_type == "docx":
            doc = Document(file)
            parts = [para.text for para in doc.paragraphs]
        elif file_type == "txt":
            # Uploads arrive as raw bytes; assume UTF-8 text.
            return file.read().decode("utf-8")
        elif file_type == "pdf":
            pdf_reader = PyPDF2.PdfReader(file)
            # extract_text() may return None for pages without a text
            # layer; the original `None + "\n"` raised a TypeError.
            parts = [page.extract_text() or "" for page in pdf_reader.pages]
        elif file_type == "pptx":
            prs = Presentation(file)
            parts = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            ]
    except Exception as e:
        return f"Error reading the file: {str(e)}"
    # Single join instead of quadratic += concatenation; a trailing "\n"
    # per entry preserves the original line-per-paragraph layout.
    return "\n".join(parts) + "\n" if parts else ""
# Function to process the file and generate outputs
def process_file(file, file_type, language="en"):
    """Run the full analysis pipeline over an uploaded document.

    Parameters
    ----------
    file : uploaded file object (as supplied by gr.File).
    file_type : "pdf", "docx", "txt" or "pptx".
    language : reserved for future localization; currently unused.

    Returns
    -------
    tuple
        (original content, rephrased text, summary, sentiment label,
        keywords, path to a downloadable copy). On failure the first
        element is an error message and the rest are None.
    """
    content = read_file(file, file_type)
    # read_file signals failure with a fixed prefix; checking the prefix
    # (rather than `"Error" in content`) avoids rejecting documents that
    # merely contain the word "Error".
    if not content.strip() or content.startswith("Error reading the file:"):
        return "Error: The document is empty or unsupported format.", None, None, None, None, None
    # Summarize. BART has a bounded input window, so cap the input rather
    # than letting the model error out on very long documents.
    try:
        summary = summarizer(content[:4000], max_length=150, min_length=50, do_sample=False)
        summary_text = summary[0]['summary_text']
    except Exception as e:
        summary_text = f"Summary Error: {str(e)}"
    # Rephrase the whole document in fixed-size character chunks that fit
    # the paraphraser's input limit.
    try:
        chunk_size = 500
        rephrased_chunks = []
        for start in range(0, len(content), chunk_size):
            result = rephraser(content[start:start + chunk_size])
            rephrased_chunks.append(result[0]['generated_text'])
        rephrased_text = " ".join(rephrased_chunks)
    except Exception as e:
        rephrased_text = f"Rephrase Error: {str(e)}"
    # Sentiment analysis on a 512-char prefix (model input limit).
    try:
        sentiment = sentiment_analyzer(content[:512])
        sentiment_text = sentiment[0]['label']
    except Exception as e:
        sentiment_text = f"Sentiment Analysis Error: {str(e)}"
    # Naive keywords: first 10 whitespace-separated tokens. TODO: replace
    # with a real keyword extractor.
    keywords = " ".join(content.split()[:10])
    # Persist the extracted text so the UI can offer it for download.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file:
            temp_file.write(content.encode('utf-8'))
            processed_file_path = temp_file.name
    except Exception as e:
        processed_file_path = f"Error saving processed document: {str(e)}"
    return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path
# Define the functions for the different pages
def home_page():
    """Build the Home tab: upload a document, view original vs rephrased text."""
    with gr.Blocks() as home:
        # Header
        gr.Markdown("## Upload a Document to Process")
        # Menu bar as buttons
        with gr.Row():
            home_btn = gr.Button("Home")
            full_analysis_btn = gr.Button("Full Analysis", variant="primary")
        # Display content on home page
        gr.Markdown("Welcome to the Document Processor!")
        gr.Markdown("Upload your document here and click to view details on the 'Full Analysis' page.")
        # File upload and content output
        file_input = gr.File(label="Upload Document")
        content_output = gr.Textbox(label="Original Content")
        rephrased_output = gr.Textbox(label="Rephrased Content")

        def on_file_upload(file):
            if not file:
                return "No file uploaded.", None
            # Infer the file type from the upload's extension instead of
            # hard-coding "docx" (the original mis-parsed every non-docx
            # upload). Falls back to "docx" when there is no extension.
            filename = getattr(file, "name", str(file))
            ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else "docx"
            content, rephrased, _, _, _, _ = process_file(file, file_type=ext)
            return content, rephrased

        # Process file on upload
        file_input.change(on_file_upload, inputs=file_input, outputs=[content_output, rephrased_output])
    return home
def detailed_page():
    """Build the Full Analysis tab: keywords, sentiment, and a download link."""
    with gr.Blocks() as detailed:
        # Header
        gr.Markdown("## Detailed Analysis Page")
        # Menu bar as buttons
        with gr.Row():
            home_btn = gr.Button("Home", variant="primary")
            full_analysis_btn = gr.Button("Full Analysis")
        # File upload and processing components
        file_input = gr.File(label="Upload Document")
        file_type = gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type")
        keywords_output = gr.Textbox(label="Keywords")
        sentiment_output = gr.Textbox(label="Sentiment Analysis")
        download_link = gr.File(label="Download Processed Document")

        def on_file_upload(file, file_type):
            # Three outputs are wired below, so the guard must return
            # exactly three values (the original returned four, breaking
            # Gradio's output mapping on the no-file path).
            if not file:
                return "No file uploaded.", None, None
            _, _, _, sentiment, keywords, download_path = process_file(file, file_type)
            return keywords, sentiment, download_path

        # Process file on upload
        file_input.change(on_file_upload, inputs=[file_input, file_type], outputs=[keywords_output, sentiment_output, download_link])
        # Sample output or content for the detailed analysis page
        gr.Markdown("Here you will see detailed analysis outputs after document upload.")
    return detailed
# Assemble the two pages into a tabbed application and start the server.
pages = [home_page(), detailed_page()]
iface = gr.TabbedInterface(pages, ["Home", "Full Analysis"])
iface.launch()