Added progress and infinite-loop prevention
app.py (CHANGED)
@@ -13,6 +13,9 @@ import docx2txt
 from reportlab.lib.pagesizes import letter
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet
+import time
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+import docx
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -49,7 +52,7 @@ class AdvancedDocProcessor:
     def extract_text_from_pdf(self, pdf_content: bytes) -> str:
         """Extract text from PDF using OCR."""
         try:
-            images = convert_from_bytes(pdf_content)
+            images = convert_from_bytes(pdf_content, timeout=60)  # Add timeout
             text = ""
             for image in images:
                 text += pytesseract.image_to_string(image)
@@ -69,7 +72,6 @@ class AdvancedDocProcessor:
     def clean_and_summarize_text(self, text: str) -> str:
         """Clean and summarize the text using BART."""
         try:
-            # Process the text in chunks
             chunk_size = 1024
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             summarized_chunks = []
@@ -85,7 +87,6 @@ class AdvancedDocProcessor:
     def process_with_t5(self, text: str, prompt: str) -> str:
         """Process the text with T5 based on the given prompt."""
         try:
-            # Process the text in chunks
             chunk_size = 512
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             processed_chunks = []
@@ -108,7 +109,6 @@ class AdvancedDocProcessor:
     def extract_entities(self, text: str) -> str:
         """Extract named entities from the text."""
         try:
-            # Process the text in chunks
             chunk_size = 10000
             chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
             all_entities = []
@@ -137,20 +137,28 @@ def create_gradio_interface():
     processor = AdvancedDocProcessor()
 
     def process_and_display(file, prompt, output_format):
-        ...  # previous body (14 lines) not captured in this view
+        def processing_task():
+            file_content = file
+            file_type = infer_file_type(file_content)
+            results = processor.process_document(file_content, file_type, prompt)
+
+            if output_format == "txt":
+                output_path = save_as_txt(results)
+            elif output_format == "docx":
+                output_path = save_as_docx(results)
+            else:  # pdf
+                output_path = save_as_pdf(results)
+
+            return (f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
+                    f"Processed Text:\n{results['processed']}\n\n"
+                    f"Extracted Entities:\n{results['entities']}"), output_path
+
+        with ThreadPoolExecutor() as executor:
+            future = executor.submit(processing_task)
+            try:
+                return future.result(timeout=300)  # 5 minutes timeout
+            except TimeoutError:
+                return "Processing timed out after 5 minutes.", None
 
     iface = gr.Interface(
         fn=process_and_display,
@@ -212,3 +220,9 @@ def save_as_pdf(results: Dict[str, str]) -> str:
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()
+
+
+# Launch the Gradio app
+if __name__ == "__main__":
+    iface = create_gradio_interface()
+    iface.launch()
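
The infinite-loop guard added to `process_and_display` relies on the standard `concurrent.futures` idiom: run the work in a worker thread and bound the wait with `future.result(timeout=...)`. Below is a minimal, self-contained sketch of that idiom, not the app's code; `slow_task`, `run_with_timeout`, and the 5-second limit are illustrative stand-ins for the document-processing call and the 300-second limit in the diff.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError


def slow_task() -> str:
    # Stand-in for a long-running call such as processor.process_document(...).
    time.sleep(10)
    return "finished"


def run_with_timeout(limit_s: float = 5.0) -> str:
    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(slow_task)
        try:
            # Only the wait is bounded; the worker thread itself is not interrupted.
            return future.result(timeout=limit_s)
        except TimeoutError:
            return f"Processing timed out after {limit_s} seconds."


if __name__ == "__main__":
    print(run_with_timeout())

One caveat of this pattern: a timed-out task keeps running in its worker thread, and leaving the `with` block still waits for it (the implicit shutdown uses `wait=True`), so the timeout caps how long the caller waits for a result rather than terminating a runaway loop.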