Spaces:
Sleeping
Sleeping
import gradio as gr | |
from google.cloud import vision | |
from PIL import Image | |
import pandas as pd | |
import os | |
from io import BytesIO | |
def _image_to_bytes(image_file):
    """Return encoded image bytes for *image_file*.

    Accepts raw ``bytes``/``bytearray`` (returned as ``bytes``) or any object
    exposing a PIL-style ``save(fp, format=...)`` method, which is what
    ``gr.Image(type="pil")`` passes to the callback.
    """
    if isinstance(image_file, (bytes, bytearray)):
        return bytes(image_file)
    # PNG is lossless, so re-encoding does not degrade the text for OCR.
    with BytesIO() as buffer:
        image_file.save(buffer, format="PNG")
        return buffer.getvalue()


def _collect_table_rows(lines):
    """Split every comma-containing line into a list of stripped cells."""
    # A comma is the (heuristic) signal that a line is a table row. Rows are
    # collected unconditionally so a table at the very end of the text is kept.
    return [
        [cell.strip() for cell in line.split(",")]
        for line in lines
        if "," in line
    ]


def _rows_to_csv(rows):
    """Render *rows* as CSV text, using the first row as the header."""
    if not rows:
        return ""
    # Pad short rows so ragged OCR output cannot make DataFrame construction
    # fail on a column-count mismatch.
    width = max(len(row) for row in rows)
    padded = [row + [""] * (width - len(row)) for row in rows]
    df = pd.DataFrame(padded[1:], columns=padded[0])
    return df.to_csv(index=False)


def extract_tables_with_google_vision(image_file):
    """Extract comma-separated table rows from an image via Google Cloud Vision.

    Args:
        image_file: A PIL image (as delivered by ``gr.Image(type="pil")``),
            raw encoded image bytes, or ``None`` when nothing was uploaded.

    Returns:
        CSV text whose header is the first detected comma-separated line and
        whose rows are the remaining ones; an empty string when no image was
        supplied or no such lines were detected.

    Raises:
        RuntimeError: If the Vision API response carries an error message.
    """
    if image_file is None:
        return ""

    img_bytes = _image_to_bytes(image_file)

    client = vision.ImageAnnotatorClient()
    response = client.text_detection(image=vision.Image(content=img_bytes))
    if response.error.message:
        raise RuntimeError(
            f"Google Cloud Vision request failed: {response.error.message}"
        )

    texts = response.text_annotations
    if not texts:
        return ""

    # The first annotation holds the full detected text; the remaining ones
    # are individual words, so real lines come from splitting the first entry.
    lines = texts[0].description.splitlines()
    return _rows_to_csv(_collect_table_rows(lines))
# Gradio UI: a single image upload wired to the table extractor, with the
# resulting CSV shown in a text box.
interface = gr.Interface(
    fn=extract_tables_with_google_vision,
    inputs=gr.Image(type="pil", label="Upload a PDF page image"),
    outputs=gr.Textbox(label="Extracted Tables"),
    title="PDF Table Extractor with Google Cloud Vision",
    description="Upload an image of a PDF page to extract tables.",
    # Gradio expects one of "never" / "auto" / "manual" here; a bare boolean
    # is not a valid value.
    allow_flagging="never",
)

interface.launch()