|
import ast
import os
import subprocess
from typing import Optional, List

import gradio as gr
import pdfplumber
from docx import Document
from pdf2image import convert_from_path
from pptx import Presentation
|
|
|
|
|
def extract_text_from_pptx(file_path):
    """Return the text of every shape in a .pptx deck as a single string.

    Text from shapes on the same slide is joined with one newline; slides
    are separated from each other by a blank line.

    Args:
        file_path: Value handed to ``pptx.Presentation`` to open the deck.

    Returns:
        The concatenated text of all shapes that expose a ``text``
        attribute, in slide order.
    """
    deck = Presentation(file_path)
    # Shapes without a ``text`` attribute are skipped.
    per_slide = (
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    )
    return "\n\n".join(per_slide)
|
|
|
|
|
def extract_text_from_ppt(file_path):
    """Extract the text of a legacy .ppt file by converting it to .pptx.

    The file is converted with the external ``unoconv`` executable, the
    resulting .pptx is read with ``extract_text_from_pptx``, and the
    converted file is deleted afterwards, even when reading it fails.

    Args:
        file_path: Path of the .ppt file to convert.

    Returns:
        The extracted text, or the string
        ``"Error extracting text from PPT file"`` if any step fails.
    """
    try:
        # The code assumes unoconv writes its output next to the input,
        # with the extension swapped to .pptx.
        pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)

        # Clean up only once the conversion has succeeded, so a pre-existing
        # file at that path is never removed because of a failed conversion.
        try:
            return extract_text_from_pptx(pptx_file_path)
        finally:
            if os.path.exists(pptx_file_path):
                os.remove(pptx_file_path)
    except Exception as e:
        # Best-effort by design: the caller gets a message, not a traceback.
        print(f"Error extracting text from PPT file: {e}")
        return "Error extracting text from PPT file"
|
|
|
|
|
def extract_text_from_ppt_or_pptx(file_path):
    """Dispatch a PowerPoint file to the matching text extractor.

    Args:
        file_path: Either a filesystem path (``str`` or ``os.PathLike``) or
            an object exposing the path as ``.name``.

    Returns:
        The extracted text for ``.pptx`` / ``.ppt`` files (extension matched
        case-insensitively), otherwise an "Unsupported file type" message.
    """
    unsupported = "Unsupported file type. Please provide a .ppt or .pptx file."

    # Path-like values are checked first: pathlib.Path also has a ``.name``
    # attribute, but it holds only the final component, not the full path.
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = getattr(file_path, "name", file_path)

    # Covers None (nothing uploaded) and any other non-path value.
    if not isinstance(path, str):
        return unsupported

    lowered = path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(path)
    return unsupported
|
|
|
|
|
def convert_pdf_to_image(file):
    """Convert a PDF into images via ``pdf2image.convert_from_path``.

    Args:
        file: Value forwarded unchanged to ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns for that input.
    """
    return convert_from_path(file)
|
|
|
|
|
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        file: Value passed to ``pdfplumber.open``.

    Returns:
        The text of all pages, each page followed by a newline. Pages for
        which no text is returned contribute only the newline.
    """
    with pdfplumber.open(file) as pdf:
        # extract_text() may yield None for a page without extractable text
        # (e.g. a scanned image); fall back to "" so the join cannot fail.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
|
|
|
|
|
def extract_text_from_docx(file):
    """Return the paragraph text of a .docx document.

    Args:
        file: Object whose ``name`` attribute is passed to
            ``docx.Document`` to open the document.

    Returns:
        The text of every paragraph, each followed by a newline.
    """
    document = Document(file.name)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
|
|
|
|
|
def convert_doc_to_text(doc_path):
    """Convert a legacy .doc file to plain text with ``unoconv``.

    The converter's .txt output is read, stripped of any leading byte-order
    mark, and deleted again, even when reading it fails.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        The document text, or ``""`` if the conversion fails, the converter
        cannot be started, or its output cannot be read.
    """
    # Swap only the final extension. A plain str.replace(".doc", ".txt")
    # would also rewrite ".doc" wherever else it occurs in the path.
    # The code assumes unoconv writes its output next to the input.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"

    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
        # Clean up only once the conversion has succeeded, so a pre-existing
        # file at that path is never removed because of a failed conversion.
        try:
            with open(txt_file_path, "r") as f:
                text = f.read()
        finally:
            if os.path.exists(txt_file_path):
                os.remove(txt_file_path)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing unoconv executable and an unreadable or
        # missing output file.
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    return text.lstrip("\ufeff")
|
|
|
|
|
def extract_text_from_doc_or_docx(file):
    """Dispatch a Word document to the matching text extractor.

    Args:
        file: Object whose ``name`` attribute holds the document's path.

    Returns:
        The extracted text for names ending in ``.docx`` or ``.doc``,
        otherwise an "Unsupported file type" message.
    """
    name = file.name
    # ".docx" is tested first; the two suffixes are handled by different helpers.
    if name.endswith(".docx"):
        return extract_text_from_docx(file)
    if name.endswith(".doc"):
        return convert_doc_to_text(name)
    return "Unsupported file type. Please upload a .doc or .docx file."
|
|
|
|
|
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Parse a ``[[front, back], ...]`` literal embedded in free text.

    Everything from the first ``[`` to the last ``]`` in *text* is parsed as
    a Python literal, and each two-element item becomes a
    ``{"front": ..., "back": ...}`` dict.

    Args:
        text: Text that contains a list-of-pairs literal somewhere inside.

    Returns:
        A list of ``{"front", "back"}`` dicts. If an item cannot be unpacked
        into exactly two values, parsing stops there and the pairs collected
        so far are returned. ``None`` is returned when no pair was collected
        that way, when no list literal is found, or when the literal does
        not parse to a list.
    """
    if not isinstance(text, str):
        print("Error parsing the list of lists: input is not a string")
        return None

    left = text.find("[")
    right = text.rfind("]")
    if left == -1 or right < left:
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text comes from a user-facing endpoint. It used to be
        # passed to eval(), which executes arbitrary code. literal_eval only
        # accepts Python literals (lists, strings, numbers, ...).
        parsed = ast.literal_eval(text[left : right + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    cards = []
    for item in parsed:
        try:
            front, back = item
        except (TypeError, ValueError) as e:
            # Malformed item: keep the pairs gathered so far, if any.
            print(e)
            return cards or None
        cards.append({"front": front, "back": back})
    return cards
|
|
|
|
|
# Tab: uploaded file in, image gallery out, handled by convert_pdf_to_image.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
|
# Tab: uploaded file in, extracted text out, handled by extract_text_from_pdf.
pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)
|
|
|
# Tab: uploaded DOC/DOCX in, extracted text out, handled by
# extract_text_from_doc_or_docx.
doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)
|
|
|
# Tab: uploaded PPT/PPTX in, extracted text out, handled by
# extract_text_from_ppt_or_pptx.
pptx_or_ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt_or_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
    api_name="pptx_or_ppt_to_text",
)
|
|
|
# Tab: free text containing a [[front, back], ...] literal in, JSON out,
# handled by sanitize_list_of_lists. The example shows the expected input shape.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)
|
|
|
# Combine the individual interfaces into one tabbed app; tab_names is
# positional, so each title labels the interface at the same index.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_or_docx_to_text,
        pptx_or_ppt_to_text,
        str_to_json,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC/DOCX Text",
        "Extract PPTX/PPT Text",
        "Extract Json",
    ],
)
|
|
|
# Listen on all IPv4 interfaces. server_name must be a valid address:
# "0.0.0.0" with no trailing dot.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
|