add string sanitization
Browse files- app.py +65 -6
- requirements.txt +1 -0
app.py
CHANGED
@@ -5,6 +5,7 @@ import pdfplumber
|
|
5 |
from docx import Document
|
6 |
import subprocess
|
7 |
import os
|
|
|
8 |
|
9 |
|
10 |
def extract_text_from_pptx(file_path):
|
@@ -20,11 +21,12 @@ def extract_text_from_pptx(file_path):
|
|
20 |
|
21 |
return "\n\n".join(text_content)
|
22 |
|
|
|
23 |
def extract_text_from_ppt(file_path):
|
24 |
try:
|
25 |
# Convert PPT to PPTX using unoconv
|
26 |
-
pptx_file_path = os.path.splitext(file_path)[0] +
|
27 |
-
subprocess.run([
|
28 |
|
29 |
# Extract text from PPTX
|
30 |
presentation = Presentation(pptx_file_path)
|
@@ -45,10 +47,11 @@ def extract_text_from_ppt(file_path):
|
|
45 |
print(f"Error extracting text from PPT file: {e}")
|
46 |
return "Error extracting text from PPT file"
|
47 |
|
|
|
48 |
def extract_text_from_ppt_or_pptx(file_path):
|
49 |
-
if file_path.endswith(
|
50 |
return extract_text_from_pptx(file_path)
|
51 |
-
elif file_path.endswith(
|
52 |
return extract_text_from_ppt(file_path)
|
53 |
else:
|
54 |
return "Unsupported file type. Please provide a .ppt or .pptx file."
|
@@ -103,6 +106,37 @@ def extract_text_from_doc_or_docx(file):
|
|
103 |
return "Unsupported file type. Please upload a .doc or .docx file."
|
104 |
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
pdf_to_img = gr.Interface(
|
107 |
convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
|
108 |
)
|
@@ -127,9 +161,34 @@ pptx_or_ppt_to_text = gr.Interface(
|
|
127 |
api_name="pptx_or_ppt_to_text",
|
128 |
)
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
demo = gr.TabbedInterface(
|
131 |
-
[pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
|
132 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
)
|
134 |
|
135 |
demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
|
|
|
5 |
from docx import Document
|
6 |
import subprocess
|
7 |
import os
|
8 |
+
from typing import Optional, List
|
9 |
|
10 |
|
11 |
def extract_text_from_pptx(file_path):
|
|
|
21 |
|
22 |
return "\n\n".join(text_content)
|
23 |
|
24 |
+
|
25 |
def extract_text_from_ppt(file_path):
|
26 |
try:
|
27 |
# Convert PPT to PPTX using unoconv
|
28 |
+
pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
|
29 |
+
subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
|
30 |
|
31 |
# Extract text from PPTX
|
32 |
presentation = Presentation(pptx_file_path)
|
|
|
47 |
print(f"Error extracting text from PPT file: {e}")
|
48 |
return "Error extracting text from PPT file"
|
49 |
|
50 |
+
|
51 |
def extract_text_from_ppt_or_pptx(file_path):
    """Dispatch a presentation file to the extractor matching its extension.

    The extension check is case-insensitive, so "DECK.PPTX" is routed the same
    way as "deck.pptx" instead of being rejected as unsupported.

    Args:
        file_path: Path of the presentation file, as a string.

    Returns:
        Whatever extract_text_from_pptx returns for ".pptx" files, whatever
        extract_text_from_ppt returns for ".ppt" files, or a message saying
        the file type is unsupported for anything else.
    """
    # Compare against a lowered copy only; the extractors still receive the
    # path exactly as it was given.
    lowered = file_path.lower()
    if lowered.endswith(".pptx"):
        return extract_text_from_pptx(file_path)
    if lowered.endswith(".ppt"):
        return extract_text_from_ppt(file_path)
    return "Unsupported file type. Please provide a .ppt or .pptx file."
|
|
|
106 |
return "Unsupported file type. Please upload a .doc or .docx file."
|
107 |
|
108 |
|
109 |
+
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Extract front/back pairs from text containing a list-of-lists literal.

    Everything before the first "[" and after the last "]" is discarded. The
    remaining slice is parsed as a Python literal, and every entry that
    unpacks into exactly two values becomes a {"front": ..., "back": ...}
    dict.

    Args:
        text: Free-form text expected to contain something like
            '[["question", "answer"], ...]'.

    Returns:
        A list of {"front", "back"} dicts (empty for an empty list literal).
        If an entry cannot be unpacked into two values, the entries converted
        before it are returned. None is returned when the input is not a
        string, no bracketed span exists, the span is not a valid literal,
        the literal is not a list, or the very first entry is malformed.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import section.
    import ast

    if not isinstance(text, str):
        print("Error parsing the list of lists: input is not a string")
        return None

    start = text.find("[")
    end = text.rfind("]")
    if start == -1 or end < start:
        print("Error parsing the list of lists: no bracketed list found")
        return None

    try:
        # SECURITY: this text arrives from a public endpoint, so it must never
        # reach eval(). ast.literal_eval only accepts Python literals and
        # rejects calls, attribute access and any other expression.
        parsed = ast.literal_eval(text[start : end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    cards = []
    try:
        for front, back in parsed:
            cards.append({"front": front, "back": back})
    except (TypeError, ValueError) as e:
        # An entry was not iterable or did not hold exactly two values.
        print(e)
        # Return anything that was already parsed; None means the original
        # schema was not respected from the very first entry.
        return cards or None
    return cards
|
138 |
+
|
139 |
+
|
140 |
# Wires convert_pdf_to_image to a file-upload input and a gallery output,
# exposed under the API name "pdf_to_img".
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)
|
|
|
161 |
api_name="pptx_or_ppt_to_text",
|
162 |
)
|
163 |
|
164 |
+
# Wires sanitize_list_of_lists to a text input and a JSON output, exposed
# under the API name "str_to_json". The single example is a list of
# [question, answer] pairs.
str_to_json = gr.Interface(
    fn=sanitize_list_of_lists,
    inputs=gr.Text(),
    outputs=gr.JSON(),
    api_name="str_to_json",
    examples=[
        """[
["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
]"""
    ],
)
|
182 |
+
|
183 |
# Combine the individual interfaces into one tabbed app. The two lists are
# parallel: each tab title belongs to the interface at the same index.
demo = gr.TabbedInterface(
    interface_list=[
        pdf_to_img,
        pdf_to_text,
        doc_or_docx_to_text,
        pptx_or_ppt_to_text,
        str_to_json,
    ],
    tab_names=[
        "PDF to Image",
        "Extract PDF Text",
        "Extract DOC/DOCX Text",
        "Extract PPTX/PPT Text",
        "Extract Json",
    ],
)

# NOTE(review): the host string ends with a trailing dot ("0.0.0.0.") --
# confirm this is intentional and not a typo for "0.0.0.0".
demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
pdf2image
|
2 |
gradio
|
3 |
pdfplumber
|
|
|
1 |
+
typing
|
2 |
pdf2image
|
3 |
gradio
|
4 |
pdfplumber
|