kopeck committed on
Commit 40b0c63
1 Parent(s): 5b1aa17

Upload 3 files

Files changed (3)
  1. app.py +171 -148
  2. packages.txt +2 -3
  3. requirements.txt +6 -7
app.py CHANGED
@@ -1,148 +1,171 @@
- import gradio as gr
- from typing import Union, List
- import logging
- import tempfile
- import pytesseract
- import fitz  # PyMuPDF
- from PIL import Image
- import re
- import os
- import subprocess
- import sys
- from tqdm import tqdm
- import requests
- import json
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Claude API configuration (the committed file hardcoded a live API key here; redacted, read it from the environment instead)
- CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY", "")
- CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
-
- def check_tesseract():
-     try:
-         version = subprocess.check_output(['tesseract', '--version']).decode('utf-8')
-         print(f"Tesseract is installed. Version: {version.split()[1]}")
-         return True
-     except FileNotFoundError:
-         print("Tesseract is not installed.")
-         return False
-
- # Run the check
- if not check_tesseract():
-     print("Tesseract is required for this application to run.")
-     sys.exit(1)
-
- def perform_ocr(file_content: bytes, lang: str) -> str:
-     """Perform OCR on the given PDF file content using Tesseract."""
-     try:
-         pdf_document = fitz.open(stream=file_content, filetype="pdf")
-         text = ""
-         for page in pdf_document:
-             pix = page.get_pixmap()
-             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-             text += pytesseract.image_to_string(img, lang=lang)
-         return text
-     except Exception as e:
-         logger.error(f"Error performing OCR: {str(e)}")
-         return ""
-
- def process_with_claude(text: str) -> str:
-     """Process the scanned text with Claude."""
-     try:
-         headers = {
-             "Content-Type": "application/json",
-             "x-api-key": CLAUDE_API_KEY,
-         }
-
-         data = {
-             "messages": [
-                 {"role": "system", "content": "You are an expert at summarizing and cleaning up OCR text. Your task is to summarize the given text, correct any obvious OCR errors, and improve readability."},
-                 {"role": "user", "content": f"Please summarize and clean up the following OCR text: {text[:4000]}"}  # Limiting to 4000 chars to avoid token limits
-             ],
-             "max_tokens": 1000,
-             "model": "claude-2.1"
-         }
-
-         response = requests.post(CLAUDE_API_URL, headers=headers, data=json.dumps(data))
-         response.raise_for_status()
-
-         result = response.json()
-         return result['content'][0]['text']
-     except Exception as e:
-         logger.error(f"Error processing with Claude: {str(e)}")
-         return text
-
- def process_documents(files: List[Union[tempfile.SpooledTemporaryFile, gr.File]], lang: str) -> List[dict]:
-     """Process multiple documents and return the results."""
-     results = []
-     for file in tqdm(files, desc="Processing documents"):
-         try:
-             if isinstance(file, gr.File):
-                 file_content = file.value
-             elif hasattr(file, 'read'):
-                 file_content = file.read()
-             else:
-                 file_content = file  # Assume it's already the file content
-
-             ocr_text = perform_ocr(file_content, lang)
-             processed_text = process_with_claude(ocr_text)
-
-             results.append({
-                 "original": ocr_text[:500] + "...",
-                 "processed": processed_text,
-             })
-         except Exception as e:
-             logger.error(f"Error processing document: {str(e)}")
-             results.append({
-                 "error": f"Failed to process document: {str(e)}"
-             })
-
-     return results
-
- def format_results(results: List[dict]) -> str:
-     """Format the results for display."""
-     output = ""
-     for i, result in enumerate(results, 1):
-         output += f"Document {i}:\n"
-         if "error" in result:
-             output += f"Error: {result['error']}\n"
-         else:
-             output += f"Original Text (first 500 chars):\n{result['original']}\n\n"
-             output += f"Processed Text:\n{result['processed']}\n\n"
-         output += "-" * 50 + "\n\n"
-     return output
-
- def save_results(results: List[dict]) -> str:
-     """Save the results to a file and return the file path."""
-     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
-         temp_file.write(format_results(results))
-         return temp_file.name
-
- def process_and_display(files, lang):
-     results = process_documents(files, lang)
-     formatted_results = format_results(results)
-     file_path = save_results(results)
-     return formatted_results, file_path
-
- # Gradio interface
- iface = gr.Interface(
-     fn=process_and_display,
-     inputs=[
-         gr.File(label="Upload PDF Documents", file_count="multiple", type="binary"),
-         gr.Dropdown(choices=["eng", "fra", "deu", "spa"], label="OCR Language", value="eng"),
-     ],
-     outputs=[
-         gr.Textbox(label="Processed Text", lines=20),
-         gr.File(label="Download Results")
-     ],
-     title="Claude-Enhanced Document OCR and Processing Tool",
-     description="Upload PDF documents to scan, process, and clean the text using Claude AI.",
-     allow_flagging="never"
- )
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     iface.launch()
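A note on the removed process_with_claude: as written, the request would be rejected by the Anthropic Messages API, which requires an anthropic-version header and takes the system prompt as a top-level "system" field rather than a "system" role inside messages. A minimal corrected sketch, assuming the key is read from the CLAUDE_API_KEY environment variable (moot for the new version below, which swaps the API call for local models):

import os
import requests

def summarize_ocr_text(text: str) -> str:
    """Ask Claude to clean up and summarize OCR output (sketch)."""
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": os.environ["CLAUDE_API_KEY"],
            "anthropic-version": "2023-06-01",  # required header
            "content-type": "application/json",
        },
        json={
            "model": "claude-2.1",
            "max_tokens": 1000,
            # The system prompt is a top-level field, not a message role
            "system": "You are an expert at summarizing and cleaning up OCR text.",
            "messages": [{"role": "user", "content": f"Please summarize and clean up: {text[:4000]}"}],
        },
    )
    response.raise_for_status()
    return response.json()["content"][0]["text"]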
+ import gradio as gr
+ from typing import Dict
+ import logging
+ import tempfile
+ import io
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from pdf2image import convert_from_bytes
+ from PIL import Image
+ import pytesseract
+ import docx2txt
+ import docx  # python-docx; save_as_docx below needs it but the original omitted this import
+ from reportlab.lib.pagesizes import letter
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+ from reportlab.lib.styles import getSampleStyleSheet
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ class AdvancedDocProcessor:
+     def __init__(self):
+         # Initialize BART model for text cleaning and summarization
+         self.bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+         self.bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+
+         # Initialize T5 model for text generation tasks
+         self.t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+         self.t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
+
+         # Initialize pipeline for named entity recognition
+         self.ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
+
+     def extract_text(self, file_content: bytes, file_type: str) -> str:
+         """Extract text from various file types."""
+         try:
+             if file_type == "application/pdf":
+                 return self.extract_text_from_pdf(file_content)
+             elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                 return self.extract_text_from_docx(file_content)
+             elif file_type == "text/plain":
+                 return file_content.decode('utf-8')
+             else:
+                 raise ValueError(f"Unsupported file type: {file_type}")
+         except Exception as e:
+             logger.error(f"Error extracting text: {str(e)}")
+             return ""
+
+     def extract_text_from_pdf(self, pdf_content: bytes) -> str:
+         """Extract text from PDF using OCR."""
+         images = convert_from_bytes(pdf_content)
+         text = ""
+         for image in images:
+             text += pytesseract.image_to_string(image)
+         return text
+
+     def extract_text_from_docx(self, docx_content: bytes) -> str:
+         """Extract text from a DOCX file."""
+         return docx2txt.process(io.BytesIO(docx_content))
+
+     def clean_and_summarize_text(self, text: str) -> str:
+         """Clean and summarize the text using BART."""
+         inputs = self.bart_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+         summary_ids = self.bart_model.generate(inputs["input_ids"], num_beams=4, max_length=150, early_stopping=True)
+         return self.bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+     def process_with_t5(self, text: str, prompt: str) -> str:
+         """Process the text with T5 based on the given prompt."""
+         input_text = f"{prompt} {text}"
+         input_ids = self.t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
+         # do_sample=True added so that temperature=0.7 actually affects generation
+         outputs = self.t5_model.generate(input_ids, max_length=150, num_return_sequences=1, do_sample=True, temperature=0.7)
+         return self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     def extract_entities(self, text: str) -> str:
+         """Extract named entities from the text."""
+         entities = self.ner_pipeline(text)
+         unique_entities = set((ent['word'], ent['entity']) for ent in entities)
+         return "\n".join([f"{word} ({entity})" for word, entity in unique_entities])
+
+     def process_document(self, file_content: bytes, file_type: str, prompt: str) -> Dict[str, str]:
+         raw_text = self.extract_text(file_content, file_type)
+         cleaned_text = self.clean_and_summarize_text(raw_text)
+         processed_text = self.process_with_t5(cleaned_text, prompt)
+         entities = self.extract_entities(raw_text)
+
+         return {
+             "original": raw_text,
+             "cleaned": cleaned_text,
+             "processed": processed_text,
+             "entities": entities
+         }
+
+ def infer_file_type(file_content: bytes) -> str:
+     """Infer the file type from the byte content."""
+     if file_content.startswith(b'%PDF'):
+         return "application/pdf"
+     elif file_content.startswith(b'PK\x03\x04'):
+         return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     else:
+         return "text/plain"
+
+ def create_gradio_interface():
+     processor = AdvancedDocProcessor()
+
+     def process_and_display(file, prompt, output_format):
+         file_content = file
+         file_type = infer_file_type(file_content)
+         results = processor.process_document(file_content, file_type, prompt)
+
+         if output_format == "txt":
+             output_path = save_as_txt(results)
+         elif output_format == "docx":
+             output_path = save_as_docx(results)
+         else:  # pdf
+             output_path = save_as_pdf(results)
+
+         return (f"Original Text (first 500 chars):\n{results['original'][:500]}...\n\n"
+                 f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
+                 f"Processed Text:\n{results['processed']}\n\n"
+                 f"Extracted Entities:\n{results['entities']}"), output_path
+
+     iface = gr.Interface(
+         fn=process_and_display,
+         inputs=[
+             gr.File(label="Upload Document (PDF, DOCX, or TXT)", type="binary"),
+             gr.Textbox(label="Enter your prompt for processing", lines=3),
+             gr.Radio(["txt", "docx", "pdf"], label="Output Format", value="txt")
+         ],
+         outputs=[
+             gr.Textbox(label="Processing Results", lines=30),
+             gr.File(label="Download Processed Document")
+         ],
+         title="Advanced Document Processing Tool",
+         description="Upload a document (PDF, DOCX, or TXT) and enter a prompt to process and analyze the text using state-of-the-art NLP models.",
+     )
+
+     return iface
+
+ def save_as_txt(results: Dict[str, str]) -> str:
+     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
+         for key, value in results.items():
+             temp_file.write(f"{key.upper()}:\n{value}\n\n")
+         return temp_file.name
+
+ def save_as_docx(results: Dict[str, str]) -> str:
+     doc = docx.Document()
+     for key, value in results.items():
+         doc.add_heading(key.capitalize(), level=1)
+         doc.add_paragraph(value)
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
+         doc.save(tmp.name)
+         return tmp.name
+
+ def save_as_pdf(results: Dict[str, str]) -> str:
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+         doc = SimpleDocTemplate(tmp.name, pagesize=letter)
+         styles = getSampleStyleSheet()
+         story = []
+
+         for key, value in results.items():
+             story.append(Paragraph(key.capitalize(), styles['Heading1']))
+             story.append(Paragraph(value, styles['BodyText']))
+             story.append(Spacer(1, 12))
+
+         doc.build(story)
+         return tmp.name
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     iface = create_gradio_interface()
+     iface.launch()
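For reference, the new pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the module is importable as app and using a hypothetical input file sample.pdf (the first run downloads the BART, FLAN-T5, and NER weights):

import app

processor = app.AdvancedDocProcessor()
with open("sample.pdf", "rb") as f:  # hypothetical input file
    content = f.read()

results = processor.process_document(content, app.infer_file_type(content), "List the key points of this document:")
print(results["cleaned"])   # BART summary of the raw OCR text
print(results["entities"])  # one "word (ENTITY)" line per recognized entity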
packages.txt CHANGED
@@ -1,3 +1,2 @@
- tesseract-ocr
- libtesseract-dev
- libleptonica-dev

+ tesseract-ocr
+ libtesseract-dev
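One caveat: the new app.py renders PDF pages through pdf2image, which depends on the Poppler utilities at the system level, so this list probably also needs the poppler-utils package (an assumption based on the new imports, not something this commit ships). A sketch of the amended packages.txt:

tesseract-ocr
libtesseract-dev
poppler-utils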
 
requirements.txt CHANGED
@@ -1,7 +1,6 @@
- gradio
- pytesseract
- PyMuPDF
- Pillow
- torch
- transformers
- tqdm

+ gradio
+ pytesseract
+ PyMuPDF
+ Pillow
+ torch
+ transformers
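These pins lag the rewritten app.py, which now imports pdf2image, docx2txt, docx (python-docx), and reportlab but no longer touches PyMuPDF; the FLAN-T5 tokenizer may also need sentencepiece. A sketch of a requirements.txt matching the new code (the usual PyPI package names, not verified against this Space):

gradio
pytesseract
Pillow
torch
transformers
sentencepiece
pdf2image
docx2txt
python-docx
reportlab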