not-lain committed on
Commit
59e60e9
1 Parent(s): 579432a

add string sanitization

Browse files
Files changed (2) hide show
  1. app.py +65 -6
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,6 +5,7 @@ import pdfplumber
5
  from docx import Document
6
  import subprocess
7
  import os
 
8
 
9
 
10
  def extract_text_from_pptx(file_path):
@@ -20,11 +21,12 @@ def extract_text_from_pptx(file_path):
20
 
21
  return "\n\n".join(text_content)
22
 
 
23
  def extract_text_from_ppt(file_path):
24
  try:
25
  # Convert PPT to PPTX using unoconv
26
- pptx_file_path = os.path.splitext(file_path)[0] + '.pptx'
27
- subprocess.run(['unoconv', '-f', 'pptx', file_path], check=True)
28
 
29
  # Extract text from PPTX
30
  presentation = Presentation(pptx_file_path)
@@ -45,10 +47,11 @@ def extract_text_from_ppt(file_path):
45
  print(f"Error extracting text from PPT file: {e}")
46
  return "Error extracting text from PPT file"
47
 
 
48
  def extract_text_from_ppt_or_pptx(file_path):
49
- if file_path.endswith('.pptx'):
50
  return extract_text_from_pptx(file_path)
51
- elif file_path.endswith('.ppt'):
52
  return extract_text_from_ppt(file_path)
53
  else:
54
  return "Unsupported file type. Please provide a .ppt or .pptx file."
@@ -103,6 +106,37 @@ def extract_text_from_doc_or_docx(file):
103
  return "Unsupported file type. Please upload a .doc or .docx file."
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  pdf_to_img = gr.Interface(
107
  convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
108
  )
@@ -127,9 +161,34 @@ pptx_or_ppt_to_text = gr.Interface(
127
  api_name="pptx_or_ppt_to_text",
128
  )
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  demo = gr.TabbedInterface(
131
- [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text],
132
- ["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text", "Extract PPTX/PPT Text"],
 
 
 
 
 
 
133
  )
134
 
135
  demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
 
5
  from docx import Document
6
  import subprocess
7
  import os
8
+ from typing import Optional, List
9
 
10
 
11
  def extract_text_from_pptx(file_path):
 
21
 
22
  return "\n\n".join(text_content)
23
 
24
+
25
  def extract_text_from_ppt(file_path):
26
  try:
27
  # Convert PPT to PPTX using unoconv
28
+ pptx_file_path = os.path.splitext(file_path)[0] + ".pptx"
29
+ subprocess.run(["unoconv", "-f", "pptx", file_path], check=True)
30
 
31
  # Extract text from PPTX
32
  presentation = Presentation(pptx_file_path)
 
47
  print(f"Error extracting text from PPT file: {e}")
48
  return "Error extracting text from PPT file"
49
 
50
+
51
  def extract_text_from_ppt_or_pptx(file_path):
52
+ if file_path.endswith(".pptx"):
53
  return extract_text_from_pptx(file_path)
54
+ elif file_path.endswith(".ppt"):
55
  return extract_text_from_ppt(file_path)
56
  else:
57
  return "Unsupported file type. Please provide a .ppt or .pptx file."
 
106
  return "Unsupported file type. Please upload a .doc or .docx file."
107
 
108
 
109
def sanitize_list_of_lists(text: str) -> Optional[List[dict]]:
    """Extract ``front``/``back`` pairs from text that contains a list literal.

    Everything before the first ``[`` and after the last ``]`` is discarded.
    The remaining slice is parsed as a Python literal and every entry is
    unpacked into exactly two items.

    Args:
        text: Raw text expected to contain a list of ``[front, back]`` pairs,
            possibly surrounded by unrelated characters.

    Returns:
        A list of ``{"front": ..., "back": ...}`` dicts. If an entry cannot be
        unpacked into two items, conversion stops there and the entries
        converted so far are returned. ``None`` is returned when the slice
        cannot be parsed, is not a list, or the very first entry already
        violates the two-item schema.
    """
    # Function-scope import so the block does not depend on the file header.
    import ast

    left = text.find("[")
    right = text.rfind("]")
    candidate = text[left : right + 1]

    try:
        # SECURITY: this text comes straight from user input. The previous
        # implementation used eval(), which executes arbitrary expressions.
        # ast.literal_eval only accepts literals (strings, numbers, lists,
        # tuples, dicts, ...) and rejects calls, names and operators.
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        print(f"Error parsing the list of lists: {e}")
        return None

    if not isinstance(parsed, list):
        print("The evaluated object is not a list.")
        return None

    out = []
    try:
        for front, back in parsed:
            out.append({"front": front, "back": back})
    except (TypeError, ValueError) as e:
        # An entry was not an iterable of exactly two items.
        print(e)
        # Return anything that was already converted; None means the
        # original schema was not respected from the first entry on.
        return out or None
    return out
138
+
139
+
140
  pdf_to_img = gr.Interface(
141
  convert_pdf_to_image, gr.File(), gr.Gallery(), api_name="pdf_to_img"
142
  )
 
161
  api_name="pptx_or_ppt_to_text",
162
  )
163
 
164
+ str_to_json = gr.Interface(
165
+ sanitize_list_of_lists,
166
+ gr.Text(),
167
+ gr.JSON(),
168
+ api_name="str_to_json",
169
+ examples=[
170
+ """[
171
+ ["What year was the Carthaginian Empire founded?", "Around 814 BCE"],
172
+ ["Where was the center of the Carthaginian Empire located?", "Carthage, near present-day Tunis, Tunisia"],
173
+ ["Which powerful ancient republic did Carthage have conflicts with?", "The Roman Republic"],
174
+ ["Fill in the blank: Hannibal famously crossed the ________ with war elephants.", "Alps"],
175
+ ["What were the series of conflicts between Carthage and Rome called?", "The Punic Wars"],
176
+ ["Multiple Choice: What was a significant military advantage of Carthage? A) Strong infantry, B) Powerful navy, C) Fortified cities", "B) Powerful navy"],
177
+ ["In what year was Carthage captured and destroyed by Rome?", "146 BCE"],
178
+ ["What did Carthage excel in that allowed it to amass wealth?", "Maritime trade"]
179
+ ]"""
180
+ ],
181
+ )
182
+
183
  demo = gr.TabbedInterface(
184
+ [pdf_to_img, pdf_to_text, doc_or_docx_to_text, pptx_or_ppt_to_text, str_to_json],
185
+ [
186
+ "PDF to Image",
187
+ "Extract PDF Text",
188
+ "Extract DOC/DOCX Text",
189
+ "Extract PPTX/PPT Text",
190
+ "Extract Json",
191
+ ],
192
  )
193
 
194
  demo.launch(server_name="0.0.0.0.", server_port=7860, debug=True)
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  pdf2image
2
  gradio
3
  pdfplumber
 
1
+ typing
2
  pdf2image
3
  gradio
4
  pdfplumber