Spaces:

shimer56
/

Extract_PDF

Sleeping

Upload folder using huggingface_hub

d2cb17f verified 5 months ago

1.7 kB

	import gradio as gr
	from google.cloud import vision
	from PIL import Image
	import pandas as pd
	import os
	from io import BytesIO


	def _image_to_bytes(image_file):
	    """Return encoded image bytes for a PIL image or an existing bytes-like object."""
	    if isinstance(image_file, (bytes, bytearray, memoryview)):
	        # Already-encoded image data is forwarded unchanged.
	        return bytes(image_file)
	    # Anything else is expected to expose PIL's ``save`` method; the UI is
	    # configured with ``gr.Image(type="pil")``. PNG is lossless, so the
	    # pixels sent for OCR are exactly the ones that were uploaded.
	    with BytesIO() as buffer:
	        image_file.save(buffer, format="PNG")
	        return buffer.getvalue()


	def _comma_separated_rows(lines):
	    """Split every line that contains a comma into a list of stripped cells."""
	    return [
	        [cell.strip() for cell in line.split(",")]
	        for line in lines
	        if "," in line  # Heuristic: a comma marks a line as a table row.
	    ]


	def _rows_to_csv(rows):
	    """Render rows as CSV text, using the first row as the header."""
	    if not rows:
	        return ""
	    # Pad every row to the widest one so a row with more cells than the
	    # header cannot make the DataFrame constructor raise.
	    width = max(len(row) for row in rows)
	    padded = [row + [""] * (width - len(row)) for row in rows]
	    header, *body = padded
	    return pd.DataFrame(body, columns=header).to_csv(index=False)


	def extract_tables_with_google_vision(image_file):
	    """Run Google Cloud Vision OCR on an image and return table-like text as CSV.

	    Lines of the detected text that contain a comma are treated as table
	    rows. All such rows are collected in order; the first one becomes the
	    CSV header.

	    Args:
	        image_file: A PIL image (what the Gradio input delivers), raw encoded
	            image bytes, or ``None`` when nothing was uploaded.

	    Returns:
	        CSV text for the detected rows, or an empty string when there is no
	        input, no detected text, or no comma-separated line.

	    Raises:
	        RuntimeError: If the Vision API response carries an error message.
	    """
	    if image_file is None:
	        # Nothing was uploaded; skip the API call entirely.
	        return ""

	    client = vision.ImageAnnotatorClient()
	    request_image = vision.Image(content=_image_to_bytes(image_file))
	    response = client.text_detection(image=request_image)

	    # Surface API failures instead of reporting them as "no tables found".
	    if response.error.message:
	        raise RuntimeError(
	            f"Google Cloud Vision request failed: {response.error.message}"
	        )

	    annotations = response.text_annotations
	    if not annotations:
	        return ""

	    # Per the Vision documentation the first annotation holds the complete
	    # detected text and the remaining ones are individual words, so only
	    # the first entry preserves the line structure.
	    lines = annotations[0].description.splitlines()

	    return _rows_to_csv(_comma_separated_rows(lines))


	# Gradio UI: one image input wired to the OCR table extractor, with the
	# resulting CSV text shown in a textbox.
	interface = gr.Interface(
	    fn=extract_tables_with_google_vision,
	    inputs=gr.Image(type="pil", label="Upload a PDF page image"),
	    outputs=gr.Textbox(label="Extracted Tables"),
	    title="PDF Table Extractor with Google Cloud Vision",
	    description="Upload an image of a PDF page to extract tables.",
	    # Gradio documents string values here ("never", "auto", "manual");
	    # "never" disables flagging, which is what the boolean False meant.
	    allow_flagging="never",
	)

	# Start the web server when the script runs.
	interface.launch()