Spaces:

not-lain
/

utils

Running

App Files Files Community

utils / app.py

not-lain

add string sanitization

59e60e9 7 days ago

raw

history blame

5.99 kB

	from pptx import Presentation
	import gradio as gr
	from pdf2image import convert_from_path
	import pdfplumber
	from docx import Document
	import subprocess
	import os
	from typing import Optional, List


	def extract_text_from_pptx(file_path):
	    """Return the text of every slide in the .pptx file at *file_path*.

	    Within a slide, the text of each shape that exposes a ``text``
	    attribute is joined with a single newline; consecutive slides are
	    separated by a blank line.
	    """
	    deck = Presentation(file_path)
	    per_slide = [
	        "\n".join(
	            shape.text for shape in slide.shapes if hasattr(shape, "text")
	        )
	        for slide in deck.slides
	    ]
	    return "\n\n".join(per_slide)


	def extract_text_from_ppt(file_path):
	    """Extract the text of a legacy .ppt file by converting it to .pptx.

	    The conversion is delegated to the external ``unoconv`` command; the
	    converted file is expected next to the input, with the same base name
	    and a ``.pptx`` extension. The text is then read with
	    ``extract_text_from_pptx`` and the converted file is deleted.

	    Returns:
	        The extracted text, or the fixed message
	        ``"Error extracting text from PPT file"`` if any step fails (the
	        underlying error is printed).
	    """
	    pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
	    converted = False
	    try:
	        # Convert PPT to PPTX using unoconv
	        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
	        converted = True

	        # Same slide/shape walk as for native .pptx files
	        return extract_text_from_pptx(pptx_file_path)
	    except Exception as e:
	        print(f"Error extracting text from PPT file: {e}")
	        return "Error extracting text from PPT file"
	    finally:
	        # Remove the converted file even when parsing it failed, but only
	        # if the conversion itself ran: a pre-existing .pptx with the same
	        # base name must not be deleted because unoconv was unavailable.
	        if converted:
	            try:
	                os.remove(pptx_file_path)
	            except OSError:
	                # Best-effort cleanup; the extraction result is unaffected.
	                pass


	def extract_text_from_ppt_or_pptx(file_path):
	    """Extract text from a PowerPoint file, choosing the reader by extension.

	    The extension is matched case-insensitively, so ``DECK.PPTX`` and
	    ``deck.pptx`` are handled the same way.

	    Args:
	        file_path: Path of the uploaded presentation (a string).

	    Returns:
	        The extracted text, or an "Unsupported file type" message when the
	        path ends in neither ``.ppt`` nor ``.pptx``.
	    """
	    lowered = file_path.lower()
	    if lowered.endswith(".pptx"):
	        return extract_text_from_pptx(file_path)
	    if lowered.endswith(".ppt"):
	        return extract_text_from_ppt(file_path)
	    return "Unsupported file type. Please provide a .ppt or .pptx file."


	def convert_pdf_to_image(file):
	    """Return the result of pdf2image's ``convert_from_path`` for *file*."""
	    return convert_from_path(file)


	def extract_text_from_pdf(file):
	    """Return the text of every page of a PDF, one newline after each page.

	    Args:
	        file: Anything ``pdfplumber.open`` accepts (here, the uploaded file).

	    Returns:
	        The concatenated page texts; a page that yields no text contributes
	        only its trailing newline.
	    """
	    with pdfplumber.open(file) as pdf:
	        # extract_text() may return None for a page without extractable
	        # text (e.g. a scanned image); treat that as an empty page instead
	        # of failing on None + str.
	        return "".join(
	            (page.extract_text() or "") + "\n" for page in pdf.pages
	        )


	def extract_text_from_docx(file):
	    """Return the text of all paragraphs of a .docx file.

	    The document is opened from ``file.name``; each paragraph's text is
	    followed by a newline.
	    """
	    document = Document(file.name)
	    return "".join(para.text + "\n" for para in document.paragraphs)


	def convert_doc_to_text(doc_path):
	    """Convert a legacy .doc file to plain text with ``unoconv``.

	    ``unoconv`` is expected to write a ``.txt`` file next to the input with
	    the same base name. That file is read, any leading byte-order marks
	    are stripped, and the file is deleted again.

	    Args:
	        doc_path: Path of the .doc file to convert.

	    Returns:
	        The document text, or ``""`` when the conversion fails or its
	        output cannot be read (the error is printed).
	    """
	    # Swap only the final extension. str.replace(".doc", ".txt") would also
	    # rewrite ".doc" inside directory names and miss upper-case ".DOC".
	    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
	    converted = False
	    try:
	        subprocess.run(
	            ["unoconv", "--format", "txt", doc_path],
	            capture_output=True,
	            text=True,
	            check=True,
	        )
	        converted = True
	        with open(txt_file_path, "r") as f:
	            text = f.read()
	    except (subprocess.CalledProcessError, OSError) as e:
	        # OSError covers a missing unoconv binary or a missing output file,
	        # which are conversion failures just like a non-zero exit status.
	        print(f"Error converting {doc_path} to text: {e}")
	        return ""
	    finally:
	        # Delete the intermediate file even if reading it failed, but never
	        # touch a pre-existing .txt when unoconv itself did not run.
	        if converted and os.path.exists(txt_file_path):
	            os.remove(txt_file_path)
	    return text.lstrip("\ufeff")


	def extract_text_from_doc_or_docx(file):
	    """Extract text from an uploaded Word file, choosing the reader by extension.

	    ``file`` must expose the path as ``file.name``. A ``.docx`` file is read
	    with ``extract_text_from_docx``, a ``.doc`` file is converted with
	    ``convert_doc_to_text``; anything else yields an explanatory message.
	    """
	    filename = file.name
	    if filename.endswith(".docx"):
	        return extract_text_from_docx(file)
	    if filename.endswith(".doc"):
	        return convert_doc_to_text(filename)
	    return "Unsupported file type. Please upload a .doc or .docx file."


	def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
	    """Parse a list of ``[front, back]`` pairs embedded in *text*.

	    Everything before the first ``[`` and after the last ``]`` is discarded,
	    the remainder is parsed as a Python literal, and every two-item entry
	    is turned into ``{"front": ..., "back": ...}``.

	    Args:
	        text: Free-form text expected to contain a list of two-item lists.

	    Returns:
	        The list of ``front``/``back`` dicts. If an entry does not have
	        exactly two items, the entries parsed before it are returned, or
	        ``None`` when there are none. ``None`` is also returned when no
	        bracketed list is found, when the text is not a valid literal, or
	        when the literal is not a list.
	    """
	    # Imported here so the function does not depend on a module-level import.
	    import ast

	    left = text.find("[")
	    right = text.rfind("]")
	    if left == -1 or right < left:
	        print("Error parsing the list of lists: no bracketed list found")
	        return None

	    try:
	        # SECURITY: this text arrives from the public UI/API, so it must not
	        # be passed to eval(), which would execute arbitrary expressions.
	        # ast.literal_eval only accepts literals (lists, strings, numbers...).
	        list_of_lists = ast.literal_eval(text[left : right + 1])
	    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
	        print(f"Error parsing the list of lists: {e}")
	        return None

	    if not isinstance(list_of_lists, list):
	        print("The evaluated object is not a list.")
	        return None

	    out = []
	    try:
	        for front, back in list_of_lists:
	            out.append({"front": front, "back": back})
	    except (TypeError, ValueError) as e:
	        # An entry that is not a two-item sequence breaks the expected
	        # schema: keep what was parsed so far, or signal failure with None.
	        print(e)
	        return out or None
	    return out


	# --- Gradio UI wiring -------------------------------------------------
	# Each gr.Interface below wraps one of the functions defined above with an
	# input component, an output component and an api_name.

	# PDF upload -> gallery of the images returned by convert_pdf_to_image.
	pdf_to_img = gr.Interface(
	convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
	)
	# PDF upload -> extracted text shown in a textbox.
	pdf_to_text = gr.Interface(
	extract_text_from_pdf,
	gr.File(),
	gr.Textbox(placeholder="Extracted text will appear here"),
	api_name="pdf_to_text",
	)

	# DOC/DOCX upload -> extracted text shown in a textbox.
	doc_or_docx_to_text = gr.Interface(
	extract_text_from_doc_or_docx,
	gr.File(),
	gr.Textbox(placeholder="Extracted text from DOC or DOCX will appear here"),
	api_name="doc_or_docx_to_text",
	)

	# PPT/PPTX upload -> extracted text shown in a textbox.
	pptx_or_ppt_to_text = gr.Interface(
	extract_text_from_ppt_or_pptx,
	gr.File(),
	gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
	api_name="pptx_or_ppt_to_text",
	)

	# Free text -> JSON produced by sanitize_list_of_lists. The single example
	# is a list of [question, answer] pairs in the format that function parses.
	str_to_json = gr.Interface(
	sanitize_list_of_lists,
	gr.Text(),
	gr.JSON(),
	api_name="str_to_json",
	examples=[
	"""[
	["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
	["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
	["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
	["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
	["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
	["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
	["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
	["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
	]"""
	],
	)

	# Combine the five interfaces into one app; the second list gives the tab
	# titles in the same order as the interfaces in the first list.
	demo = gr.TabbedInterface(
	[pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
	[
	"PDF to Image",
	"Extract PDF Text",
	"Extract DOC/DOCX Text",
	"Extract PPTX/PPT Text",
	"Extract Json",
	],
	)

	# Start the app on port 7860, listening on all IPv4 interfaces. The host
	# must be exactly "0.0.0.0": with a trailing dot it is not a valid IPv4
	# literal.
	demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)