Spaces:

kopeck
/

ocrtest

Sleeping

App Files Community

kopeck committed on Sep 23

Commit

a5a2bc4

•

1 Parent(s): 30cb801

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -4

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import logging
 import tempfile
 import io
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from pdf2image import convert_from_bytes
 from PIL import Image
@@ -24,14 +25,14 @@ class AdvancedDocProcessor:
     def __init__(self):
         # Initialize BART model for text cleaning and summarization
         self.bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
-        self.bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
         # Initialize T5 model for text generation tasks
         self.t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
-        self.t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
         # Initialize pipeline for named entity recognition
-        self.ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
     def extract_text(self, file_content: bytes, file_type: str) -> str:
         """Extract text from various file types."""
@@ -137,7 +138,12 @@ def create_gradio_interface():
     def process_and_display(file, prompt, output_format):
         def processing_task():
-            file_content = file.read()  # Read file content
             file_type = infer_file_type(file_content)
             results = processor.process_document(file_content, file_type, prompt)
@@ -158,6 +164,9 @@ def create_gradio_interface():
                 return future.result(timeout=300)  # 5 minutes timeout
             except TimeoutError:
                 return "Processing timed out after 5 minutes.", None
     iface = gr.Interface(
         fn=process_and_display,
@@ -217,5 +226,8 @@ def save_as_pdf(results: Dict[str, str]) -> str:
 # Launch the Gradio app
 if __name__ == "__main__":
     iface = create_gradio_interface()
     iface.launch()

 import tempfile
 import io
 import torch
+import numpy as np
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from pdf2image import convert_from_bytes
 from PIL import Image
     def __init__(self):
         # Initialize BART model for text cleaning and summarization
         self.bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+        self.bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn", torch_dtype=torch.float32)
         # Initialize T5 model for text generation tasks
         self.t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+        self.t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.float32)
         # Initialize pipeline for named entity recognition
+        self.ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", torch_dtype=torch.float32)
     def extract_text(self, file_content: bytes, file_type: str) -> str:
         """Extract text from various file types."""
     def process_and_display(file, prompt, output_format):
         def processing_task():
+            if isinstance(file, str):  # If it's a file path
+                with open(file, 'rb') as f:
+                    file_content = f.read()
+            else:  # If it's already file content
+                file_content = file
             file_type = infer_file_type(file_content)
             results = processor.process_document(file_content, file_type, prompt)
                 return future.result(timeout=300)  # 5 minutes timeout
             except TimeoutError:
                 return "Processing timed out after 5 minutes.", None
+            except Exception as e:
+                logger.error(f"Error during processing: {str(e)}")
+                return f"An error occurred during processing: {str(e)}", None
     iface = gr.Interface(
         fn=process_and_display,
 # Launch the Gradio app
 if __name__ == "__main__":
+    # Set NumPy print options to avoid warnings
+    np.set_printoptions(legacy='1.13')
     iface = create_gradio_interface()
     iface.launch()