Spaces:

not-lain
/

utils

Running

File size: 5,993 Bytes

from pptx import Presentation
import gradio as gr
from pdf2image import convert_from_path
import pdfplumber
from docx import Document
import subprocess
import os
from typing import Optional, List


def extract_text_from_pptx(file_path):
    """Return the text of every slide in the presentation at *file_path*.

    The text of all shapes on one slide that expose a ``text`` attribute is
    joined with single newlines; slides are separated by a blank line.
    """
    deck = Presentation(file_path)
    per_slide = [
        "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        for slide in deck.slides
    ]
    return "\n\n".join(per_slide)


def extract_text_from_ppt(file_path):
    """Extract the text of a legacy .ppt presentation.

    The file is converted to .pptx by running the external ``unoconv``
    command, the converted file is parsed, and it is then deleted again.

    Args:
        file_path: Path of the .ppt file to read.

    Returns:
        The slide text (shapes joined by newlines, slides separated by a
        blank line), or the string ``"Error extracting text from PPT file"``
        if conversion, parsing or cleanup fails. Errors are printed, not
        raised.
    """
    try:
        # The converted file is expected next to the input, same stem.
        pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)

        try:
            presentation = Presentation(pptx_file_path)
            text_content = [
                "\n".join(
                    shape.text for shape in slide.shapes if hasattr(shape, "text")
                )
                for slide in presentation.slides
            ]
            return "\n\n".join(text_content)
        finally:
            # Remove the converted file even when parsing fails, so a broken
            # presentation does not leave a stray .pptx behind.
            os.remove(pptx_file_path)
    except Exception as e:
        print(f"Error extracting text from PPT file: {e}")
        return "Error extracting text from PPT file"


def extract_text_from_ppt_or_pptx(file_path):
    """Dispatch a presentation to the matching text extractor by extension.

    Args:
        file_path: Path of the presentation, either as a string / path-like
            object or as an upload object exposing the path as ``.name``.

    Returns:
        The extracted text, or an "Unsupported file type" message when the
        extension is neither .ppt nor .pptx (compared case-insensitively).
    """
    unsupported = "Unsupported file type. Please provide a .ppt or .pptx file."

    if isinstance(file_path, (str, os.PathLike)):
        # Check real paths first: pathlib objects also have a ``.name``, but
        # it holds only the final path component.
        path = os.fspath(file_path)
    else:
        path = getattr(file_path, "name", file_path)

    if not isinstance(path, str):
        return unsupported

    # Compare case-insensitively so "DECK.PPTX" is handled like "deck.pptx".
    lowered = path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(path)
    return unsupported


def convert_pdf_to_image(file):
    """Pass *file* to ``convert_from_path`` and return the images it yields."""
    return convert_from_path(file)


def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        file: The PDF to open; passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``. A page that
        yields no text contributes only its newline.
    """
    with pdfplumber.open(file) as pdf:
        # NOTE(review): extract_text() may return None for a page without a
        # text layer (behaviour differs between pdfplumber releases); fall
        # back to "" so such a page cannot raise TypeError on concatenation.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


def extract_text_from_docx(file):
    """Return the paragraph text of a .docx upload, one paragraph per line.

    *file* must expose the document's path as ``file.name``; every paragraph
    is followed by a newline in the result.
    """
    document = Document(file.name)
    return "".join(para.text + "\n" for para in document.paragraphs)


def convert_doc_to_text(doc_path):
    """Convert a legacy .doc file to plain text using ``unoconv``.

    Args:
        doc_path: Path of the .doc file to convert.

    Returns:
        The converted text with any leading byte-order mark removed, or an
        empty string when ``unoconv`` exits with a non-zero status.

    Raises:
        OSError: If ``unoconv`` cannot be started or the converted file
            cannot be read or removed (for example ``FileNotFoundError``).
    """
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the final extension. A blanket str.replace(".doc", ".txt")
    # also rewrites ".doc" elsewhere in the path (e.g. a "my.docs/" folder).
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        # NOTE(review): opened with the platform default encoding, as before;
        # confirm the encoding unoconv writes on the deployment host.
        with open(txt_file_path, "r") as f:
            text = f.read()
    finally:
        # Clean up the intermediate file even if reading it failed.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")


def extract_text_from_doc_or_docx(file):
    """Dispatch a Word upload to the matching text extractor by extension.

    Args:
        file: Upload object exposing the document's path as ``file.name``.

    Returns:
        The extracted text, or an "Unsupported file type" message when the
        extension is neither .doc nor .docx (compared case-insensitively).
    """
    # Compare case-insensitively so "REPORT.DOCX" is handled like "report.docx".
    lowered = file.name.lower()
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file)
    if lowered.endswith(".doc"):
        return convert_doc_to_text(file.name)
    return "Unsupported file type. Please upload a .doc or .docx file."


def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Parse a textual list of ``[front, back]`` pairs into flashcard dicts.

    Everything before the first ``[`` and after the last ``]`` is discarded,
    and the remainder is parsed as a Python literal.

    Args:
        text: Free-form text containing a list of two-item entries.

    Returns:
        A list of ``{"front": ..., "back": ...}`` dicts, one per entry. If an
        entry cannot be unpacked into two items, the entries parsed before it
        are returned; ``None`` is returned when nothing was parsed, when the
        text holds no bracketed list, or when it is not a valid list literal.
    """
    import ast  # local import: only this function needs it

    left = text.find("[")
    right = text.rfind("]")
    if left == -1 or right < left:
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text comes from the user, so it must never reach
        # eval(). literal_eval accepts only literals (strings, numbers,
        # lists, ...) and rejects calls, names and operators.
        list_of_lists = ast.literal_eval(text[left : right + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(list_of_lists, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in list_of_lists:
            out.append({"front": front, "back": back})
    except (TypeError, ValueError) as e:
        # An entry did not unpack into exactly two items.
        print(e)
        # Keep whatever was parsed so far; None means the schema was not
        # respected from the very first entry.
        return out if out else None
    return out


# One Gradio interface per utility; each one also gets a named API endpoint.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

pptx_or_ppt_to_text = gr.Interface(
    fn=extract_text_from_ppt_or_pptx,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text from PPTX will appear here"),
    api_name="pptx_or_ppt_to_text",
)

# Text-to-JSON tab, with one sample list of question/answer pairs.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
  ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
  ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
  ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
  ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
  ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
  ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
  ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
  ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)

# Combine the interfaces into one app; tab titles follow the list order.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_or_docx_to_text,
        pptx_or_ppt_to_text,
        str_to_json,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC/DOCX Text",
        "Extract PPTX/PPT Text",
        "Extract Json",
    ],
)

# Bind to all interfaces on port 7860. The host was previously written as
# "0.0.0.0." with a stray trailing dot, which is not a valid IPv4 literal.
demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)