Added progress and infinite-loop prevention
app.py (CHANGED)
@@ -13,6 +13,9 @@ import docx2txt
 from reportlab.lib.pagesizes import letter
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
+import time
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+import docx
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -49,7 +52,7 @@ class AdvancedDocProcessor:
     def extract_text_from_pdf(self, pdf_content: bytes) -> str:
         """Extract text from PDF using OCR."""
         try:
-            images = convert_from_bytes(pdf_content)
+            images = convert_from_bytes(pdf_content, timeout=60)  # Add timeout
             text = ""
             for image in images:
                 text += pytesseract.image_to_string(image)
@@ -69,7 +72,6 @@ class AdvancedDocProcessor:
     def clean_and_summarize_text(self, text: str) -> str:
         """Clean and summarize the text using BART."""
         try:
-            # Process the text in chunks
             chunk_size = 1024
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             summarized_chunks = []
@@ -85,7 +87,6 @@ class AdvancedDocProcessor:
     def process_with_t5(self, text: str, prompt: str) -> str:
         """Process the text with T5 based on the given prompt."""
         try:
-            # Process the text in chunks
             chunk_size = 512
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             processed_chunks = []
@@ -108,7 +109,6 @@ class AdvancedDocProcessor:
     def extract_entities(self, text: str) -> str:
         """Extract named entities from the text."""
         try:
-            # Process the text in chunks
             chunk_size = 10000
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             all_entities = []
@@ -137,20 +137,28 @@ def create_gradio_interface():
     processor = AdvancedDocProcessor()
 
     def process_and_display(file, prompt, output_format):
-        ...  # previous body (14 lines) not captured in this view
+        def processing_task():
+            file_content = file
+            file_type = infer_file_type(file_content)
+            results = processor.process_document(file_content, file_type, prompt)
+
+            if output_format == "txt":
+                output_path = save_as_txt(results)
+            elif output_format == "docx":
+                output_path = save_as_docx(results)
+            else:  # pdf
+                output_path = save_as_pdf(results)
+
+            return (f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
+                    f"Processed Text:\n{results['processed']}\n\n"
+                    f"Extracted Entities:\n{results['entities']}"), output_path
+
+        with ThreadPoolExecutor() as executor:
+            future = executor.submit(processing_task)
+            try:
+                return future.result(timeout=300)  # 5 minutes timeout
+            except TimeoutError:
+                return "Processing timed out after 5 minutes.", None
 
     iface = gr.Interface(
         fn=process_and_display,
@@ -212,3 +220,9 @@ def save_as_pdf(results: Dict[str, str]) -> str:
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()
+
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    iface = create_gradio_interface()
+    iface.launch()
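
The infinite-loop guard added to `process_and_display` relies on the standard `concurrent.futures` idiom: run the work in a worker thread and bound the wait with `future.result(timeout=...)`. Below is a minimal, self-contained sketch of that idiom, not the app's code; `slow_task`, `run_with_timeout`, and the 5-second limit are illustrative stand-ins for the document-processing call and the 300-second limit in the diff.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError


def slow_task() -> str:
    # Stand-in for a long-running call such as processor.process_document(...).
    time.sleep(10)
    return "finished"


def run_with_timeout(limit_s: float = 5.0) -> str:
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(slow_task)
        try:
            # Only the wait is bounded; the worker thread itself is not interrupted.
            return future.result(timeout=limit_s)
        except TimeoutError:
            return f"Processing timed out after {limit_s} seconds."


if __name__ == "__main__":
    print(run_with_timeout())

One caveat of this pattern: a timed-out task keeps running in its worker thread, and leaving the `with` block still waits for it (the implicit shutdown uses `wait=True`), so the timeout caps how long the caller waits for a result rather than terminating a runaway loop.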