"""Gradio app that rasterizes an uploaded PDF and embeds its pages.

Every page of the uploaded PDF is rendered to a PNG with pdf2image and
embedded with a ViT model; a placeholder text string is embedded with a
BERT model.  The final "insight" generation is still a stub that returns a
fixed message.
"""

import io
import logging
import os
import tempfile
from functools import lru_cache
from io import BytesIO

import gradio as gr
import torch
from pdf2image import convert_from_bytes, convert_from_path
from PIL import Image
from transformers import AutoModel, AutoTokenizer, ViTFeatureExtractor, ViTModel

logger = logging.getLogger(__name__)

# CSS styles
css = """
.button {
    padding: 10px 20px;
    background: #007BFF;
    color: white;
    border: none;
    cursor: pointer;
    font-size: 16px;
    margin: 10px;
}
"""


@lru_cache(maxsize=4)
def _load_vit(model_name):
    """Load and memoize the ViT feature extractor and model for *model_name*."""
    feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)
    return feature_extractor, model


@lru_cache(maxsize=4)
def _load_text_model(model_name):
    """Load and memoize the tokenizer and transformer model for *model_name*."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model


def get_image_embeddings(image_path: str,
                         model_name: str = 'google/vit-base-patch16-224') -> torch.Tensor:
    """Return a mean-pooled ViT embedding for the image at *image_path*.

    Args:
        image_path: Path of an image file readable by PIL.
        model_name: Hugging Face model identifier of the ViT checkpoint.

    Returns:
        ``last_hidden_state`` of the model averaged over dimension 1.
    """
    # Loading a checkpoint is far more expensive than a forward pass, so the
    # extractor/model pair is cached per model name instead of reloaded for
    # every page.
    feature_extractor, model = _load_vit(model_name)

    # The context manager closes the underlying file handle; converting to
    # RGB guards against grayscale/paletted inputs the model cannot consume.
    with Image.open(image_path) as image:
        inputs = feature_extractor(images=image.convert("RGB"), return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling


def pdf_to_images(pdf_file, img_dir: str) -> list:
    """Render every page of a PDF to ``page_<n>.png`` inside *img_dir*.

    Args:
        pdf_file: The PDF to convert.  May be a filesystem path, raw
            ``bytes``/``bytearray`` content, or a binary file-like object
            exposing ``read()``.
        img_dir: Directory that receives the PNG files; created if missing.

    Returns:
        The paths of the written PNG files, in page order.
    """
    # convert_from_path only understands filesystem paths; in-memory PDF data
    # has to go through convert_from_bytes instead.
    if isinstance(pdf_file, (bytes, bytearray)):
        images = convert_from_bytes(bytes(pdf_file))
    elif hasattr(pdf_file, "read"):
        images = convert_from_bytes(pdf_file.read())
    else:
        images = convert_from_path(pdf_file)

    # Create the directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)

    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(img_dir, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        image_paths.append(image_path)

    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
    return image_paths


def get_text_embeddings(text, model_name: str = 'bert-base-uncased') -> torch.Tensor:
    """Return a mean-pooled transformer embedding for *text*.

    Args:
        text: A string (or batch of strings) accepted by the tokenizer.
        model_name: Hugging Face model identifier of the text encoder.

    Returns:
        ``last_hidden_state`` of the model averaged over dimension 1.
    """
    tokenizer, model = _load_text_model(model_name)
    inputs = tokenizer(text, return_tensors='pt', padding=True,
                       truncation=True, max_length=512)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling


def process_pdf_and_generate_response(pdf_file) -> str:
    """Embed the pages of an uploaded PDF and return a textual response.

    Args:
        pdf_file: Raw PDF content as delivered by ``gr.File(type="binary")``;
            ``None`` or empty when nothing was uploaded.

    Returns:
        The (currently fixed) response string, or a message starting with
        ``"An error occurred:"`` when the upload is missing or processing
        fails.
    """
    if not pdf_file:
        return "An error occurred: no PDF file was uploaded."

    try:
        # A fresh temporary directory per request keeps pages from earlier
        # uploads (or concurrent requests) out of this run and is removed
        # automatically once the embeddings have been computed.
        with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
            image_paths = pdf_to_images(pdf_file, img_dir)
            # Iterate the returned paths rather than os.listdir so the
            # embeddings stay in page order.
            image_embeddings = [get_image_embeddings(path) for path in image_paths]

        # Perform some text analysis on the PDF content (replace with your logic)
        pdf_text = "PDF content analysis placeholder"
        text_embeddings = get_text_embeddings(pdf_text)

        # Combine image and text embeddings and generate a response
        # (replace with your logic)
        combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
        logger.debug("Combined embedding shape: %s", tuple(combined_embeddings.shape))

        response = "Response based on the processed PDF"
    except Exception as e:
        # Top-level UI boundary: report the failure to the user instead of
        # crashing the request, but keep the traceback in the logs.
        logger.exception("Failed to process uploaded PDF")
        response = f"An error occurred: {str(e)}"

    return response


# gr.Interface builds its own layout from `inputs` and `outputs`, so no
# separate list of Row/Button components is needed.
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox("Placeholder for PDF insights", label="Insights", type="text"),
    title="pdf-chatbot",
    description="Upload a PDF and receive insights based on its content.",
    css=css,
)

if __name__ == "__main__":
    iface.launch()