Spaces:

not-lain
/

utils

Running

App Files Files Community

not-lain committed 7 days ago

Commit

59e60e9

•

1 Parent(s): 579432a

add string sanitization

Browse files

Files changed (2) hide show

app.py +65 -6
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import pdfplumber
 from docx import Document
 import subprocess
 import os
 def extract_text_from_pptx(file_path):
@@ -20,11 +21,12 @@ def extract_text_from_pptx(file_path):
     return "\n\n".join(text_content)
 def extract_text_from_ppt(file_path):
     try:
         # Convert PPT to PPTX using unoconv
-        pptx_file_path = os.path.splitext(file_path)[0] + '.pptx'
-        subprocess.run(['unoconv', '-f', 'pptx', file_path], check=True)
         # Extract text from PPTX
         presentation = Presentation(pptx_file_path)
@@ -45,10 +47,11 @@ def extract_text_from_ppt(file_path):
         print(f"Error extracting text from PPT file: {e}")
         return "Error extracting text from PPT file"
 def extract_text_from_ppt_or_pptx(file_path):
-    if file_path.endswith('.pptx'):
         return extract_text_from_pptx(file_path)
-    elif file_path.endswith('.ppt'):
         return extract_text_from_ppt(file_path)
     else:
         return "Unsupported file type. Please provide a .ppt or .pptx file."
@@ -103,6 +106,37 @@ def extract_text_from_doc_or_docx(file):
         return "Unsupported file type. Please upload a .doc or .docx file."
 pdf_to_img = gr.Interface(
     convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
 )
@@ -127,9 +161,34 @@ pptx_or_ppt_to_text = gr.Interface(
     api_name="pptx_or_ppt_to_text",
 )
 demo = gr.TabbedInterface(
-    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
-    ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX/PPT Text"],
 )
 demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)

 from docx import Document
 import subprocess
 import os
+from typing import Optional, List
 def extract_text_from_pptx(file_path):
     return "\n\n".join(text_content)
 def extract_text_from_ppt(file_path):
     try:
         # Convert PPT to PPTX using unoconv
+        pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
+        subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
         # Extract text from PPTX
         presentation = Presentation(pptx_file_path)
         print(f"Error extracting text from PPT file: {e}")
         return "Error extracting text from PPT file"
 def extract_text_from_ppt_or_pptx(file_path):
+    if file_path.endswith(".pptx"):
         return extract_text_from_pptx(file_path)
+    elif file_path.endswith(".ppt"):
         return extract_text_from_ppt(file_path)
     else:
         return "Unsupported file type. Please provide a .ppt or .pptx file."
         return "Unsupported file type. Please upload a .doc or .docx file."
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Extract a list of ``[front, back]`` pairs embedded in ``text``.

    The input may carry arbitrary text before the first ``[`` and after the
    last ``]``; only the span between those two brackets (inclusive) is
    parsed. The span must be a Python literal list whose items each unpack
    into exactly two values.

    Args:
        text: Raw string expected to contain a list of two-item lists.

    Returns:
        A list of ``{"front": ..., "back": ...}`` dicts, one per pair.
        If an item fails to unpack into two values, the pairs parsed before
        it are returned; if nothing was parsed before the failure, ``None``
        is returned. ``None`` is also returned when no bracketed span is
        found, when the span is not a valid literal, or when the parsed
        object is not a list. An empty list literal yields ``[]``.
    """
    # Local import keeps this block self-contained.
    import ast

    left = text.find("[")
    right = text.rfind("]")
    if left == -1 or right < left:
        # Either bracket is missing, or the last "]" precedes the first "[".
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text comes from a user-facing input, so it must not
        # go through eval(), which would execute arbitrary expressions.
        # ast.literal_eval only accepts literals (strings, numbers, tuples,
        # lists, dicts, sets, booleans, None).
        parsed = ast.literal_eval(text[left : right + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in parsed:
            out.append({"front": front, "back": back})
    except (TypeError, ValueError) as e:
        # An item did not unpack into exactly two values: the original schema
        # is not respected. Return whatever was parsed before the failure, or
        # None if nothing was.
        print(e)
        return out or None
    return out
 pdf_to_img = gr.Interface(
     convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
 )
     api_name="pptx_or_ppt_to_text",
 )
+str_to_json = gr.Interface(
+    sanitize_list_of_lists,
+    gr.Text(),
+    gr.JSON(),
+    api_name="str_to_json",
+    examples=[
+        """[
+  ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
+  ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
+  ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
+  ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
+  ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
+  ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
+  ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
+  ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
+]"""
+    ],
+)
 demo = gr.TabbedInterface(
+    [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
+    [
+        "PDF to Image",
+        "Extract PDF Text",
+        "Extract DOC/DOCX Text",
+        "Extract PPTX/PPT Text",
+        "Extract Json",
+    ],
 )
 demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 pdf2image
 gradio
 pdfplumber

+typing
 pdf2image
 gradio
 pdfplumber