kopeck committed
Commit f733ed3
1 Parent(s): 8b1166b

added progress and inf loop prevention

Files changed (1)
  1. app.py (+32 -18)
app.py CHANGED
@@ -13,6 +13,9 @@ import docx2txt
 from reportlab.lib.pagesizes import letter
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
+import time
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+import docx
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -49,7 +52,7 @@ class AdvancedDocProcessor:
     def extract_text_from_pdf(self, pdf_content: bytes) -> str:
         """Extract text from PDF using OCR."""
         try:
-            images = convert_from_bytes(pdf_content)
+            images = convert_from_bytes(pdf_content, timeout=60)  # Add timeout
             text = ""
             for image in images:
                 text += pytesseract.image_to_string(image)
@@ -69,7 +72,6 @@ class AdvancedDocProcessor:
     def clean_and_summarize_text(self, text: str) -> str:
         """Clean and summarize the text using BART."""
         try:
-            # Process the text in chunks
             chunk_size = 1024
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             summarized_chunks = []
@@ -85,7 +87,6 @@ class AdvancedDocProcessor:
     def process_with_t5(self, text: str, prompt: str) -> str:
         """Process the text with T5 based on the given prompt."""
         try:
-            # Process the text in chunks
             chunk_size = 512
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             processed_chunks = []
@@ -108,7 +109,6 @@ class AdvancedDocProcessor:
     def extract_entities(self, text: str) -> str:
         """Extract named entities from the text."""
         try:
-            # Process the text in chunks
             chunk_size = 10000
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             all_entities = []
@@ -137,20 +137,28 @@ def create_gradio_interface():
     processor = AdvancedDocProcessor()
 
     def process_and_display(file, prompt, output_format):
-        file_content = file
-        file_type = infer_file_type(file_content)
-        results = processor.process_document(file_content, file_type, prompt)
-
-        if output_format == "txt":
-            output_path = save_as_txt(results)
-        elif output_format == "docx":
-            output_path = save_as_docx(results)
-        else:  # pdf
-            output_path = save_as_pdf(results)
-
-        return (f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
-                f"Processed Text:\n{results['processed']}\n\n"
-                f"Extracted Entities:\n{results['entities']}"), output_path
+        def processing_task():
+            file_content = file
+            file_type = infer_file_type(file_content)
+            results = processor.process_document(file_content, file_type, prompt)
+
+            if output_format == "txt":
+                output_path = save_as_txt(results)
+            elif output_format == "docx":
+                output_path = save_as_docx(results)
+            else:  # pdf
+                output_path = save_as_pdf(results)
+
+            return (f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
+                    f"Processed Text:\n{results['processed']}\n\n"
+                    f"Extracted Entities:\n{results['entities']}"), output_path
+
+        with ThreadPoolExecutor() as executor:
+            future = executor.submit(processing_task)
+            try:
+                return future.result(timeout=300)  # 5 minutes timeout
+            except TimeoutError:
+                return "Processing timed out after 5 minutes.", None
 
     iface = gr.Interface(
         fn=process_and_display,
@@ -212,3 +220,9 @@ def save_as_pdf(results: Dict[str, str]) -> str:
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()
+
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    iface = create_gradio_interface()
+    iface.launch()
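
The hang prevention in this commit wraps the whole document pipeline in a worker thread and bounds the wait on its result. A minimal sketch of that pattern on its own, using only the standard library (run_with_timeout and slow_task are illustrative names, not part of app.py):

    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    def run_with_timeout(fn, timeout_seconds, *args, **kwargs):
        """Run fn in a worker thread and stop waiting for it after timeout_seconds."""
        with ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(fn, *args, **kwargs)
            try:
                return future.result(timeout=timeout_seconds)
            except TimeoutError:
                # Note: the worker thread is not cancelled; exiting the
                # with-block still waits for it, so this bounds how long the
                # caller waits for a result rather than killing the work.
                return None

    def slow_task(n: int) -> int:
        return sum(range(n))

    print(run_with_timeout(slow_task, 5, 10_000_000))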
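
The BART, T5, and entity-extraction paths shown in the context lines all split their input into fixed-size character chunks before calling the model. A hedged sketch of that chunking step in isolation (chunk_text is an illustrative helper name, not defined in app.py):

    def chunk_text(text: str, chunk_size: int) -> list:
        """Split text into consecutive slices of at most chunk_size characters."""
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    # Example: chunk_text("abcdefgh", 3) -> ['abc', 'def', 'gh']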