kopeck committed on
Commit 40b0c63
1 Parent(s): 5b1aa17

Upload 3 files

Files changed (3)
  1. app.py +171 -148
  2. packages.txt +2 -3
  3. requirements.txt +6 -7
app.py CHANGED
@@ -1,148 +1,171 @@
- import gradio as gr
- from typing import Union, List
- import logging
- import tempfile
- import pytesseract
- import fitz  # PyMuPDF
- from PIL import Image
- import re
- import os
- import subprocess
- import sys
- from tqdm import tqdm
- import requests
- import json
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
- # Claude API configuration (the committed file hardcoded a live API key here; redacted, read it from the environment instead)
- CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY", "")
- CLAUDE_API_URL = "https://api.anthropic.com/v1/messages"
-
- def check_tesseract():
-     try:
-         version = subprocess.check_output(['tesseract', '--version']).decode('utf-8')
-         print(f"Tesseract is installed. Version: {version.split()[1]}")
-         return True
-     except FileNotFoundError:
-         print("Tesseract is not installed.")
-         return False
-
- # Run the check
- if not check_tesseract():
-     print("Tesseract is required for this application to run.")
-     sys.exit(1)
-
- def perform_ocr(file_content: bytes, lang: str) -> str:
-     """Perform OCR on the given PDF file content using Tesseract."""
-     try:
-         pdf_document = fitz.open(stream=file_content, filetype="pdf")
-         text = ""
-         for page in pdf_document:
-             pix = page.get_pixmap()
-             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-             text += pytesseract.image_to_string(img, lang=lang)
-         return text
-     except Exception as e:
-         logger.error(f"Error performing OCR: {str(e)}")
-         return ""
-
- def process_with_claude(text: str) -> str:
-     """Process the scanned text with Claude."""
-     try:
-         headers = {
-             "Content-Type": "application/json",
-             "x-api-key": CLAUDE_API_KEY,
-         }
-
-         data = {
-             "messages": [
-                 {"role": "system", "content": "You are an expert at summarizing and cleaning up OCR text. Your task is to summarize the given text, correct any obvious OCR errors, and improve readability."},
-                 {"role": "user", "content": f"Please summarize and clean up the following OCR text: {text[:4000]}"}  # Limiting to 4000 chars to avoid token limits
-             ],
-             "max_tokens": 1000,
-             "model": "claude-2.1"
-         }
-
-         response = requests.post(CLAUDE_API_URL, headers=headers, data=json.dumps(data))
-         response.raise_for_status()
-
-         result = response.json()
-         return result['content'][0]['text']
-     except Exception as e:
-         logger.error(f"Error processing with Claude: {str(e)}")
-         return text
-
- def process_documents(files: List[Union[tempfile.SpooledTemporaryFile, gr.File]], lang: str) -> List[dict]:
-     """Process multiple documents and return the results."""
-     results = []
-     for file in tqdm(files, desc="Processing documents"):
-         try:
-             if isinstance(file, gr.File):
-                 file_content = file.value
-             elif hasattr(file, 'read'):
-                 file_content = file.read()
-             else:
-                 file_content = file  # Assume it's already the file content
-
-             ocr_text = perform_ocr(file_content, lang)
-             processed_text = process_with_claude(ocr_text)
-
-             results.append({
-                 "original": ocr_text[:500] + "...",
-                 "processed": processed_text,
-             })
-         except Exception as e:
-             logger.error(f"Error processing document: {str(e)}")
-             results.append({
-                 "error": f"Failed to process document: {str(e)}"
-             })
-
-     return results
-
- def format_results(results: List[dict]) -> str:
-     """Format the results for display."""
-     output = ""
-     for i, result in enumerate(results, 1):
-         output += f"Document {i}:\n"
-         if "error" in result:
-             output += f"Error: {result['error']}\n"
-         else:
-             output += f"Original Text (first 500 chars):\n{result['original']}\n\n"
-             output += f"Processed Text:\n{result['processed']}\n\n"
-         output += "-" * 50 + "\n\n"
-     return output
-
- def save_results(results: List[dict]) -> str:
-     """Save the results to a file and return the file path."""
-     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
-         temp_file.write(format_results(results))
-         return temp_file.name
-
- def process_and_display(files, lang):
-     results = process_documents(files, lang)
-     formatted_results = format_results(results)
-     file_path = save_results(results)
-     return formatted_results, file_path
-
- # Gradio interface
- iface = gr.Interface(
-     fn=process_and_display,
-     inputs=[
-         gr.File(label="Upload PDF Documents", file_count="multiple", type="binary"),
-         gr.Dropdown(choices=["eng", "fra", "deu", "spa"], label="OCR Language", value="eng"),
-     ],
-     outputs=[
-         gr.Textbox(label="Processed Text", lines=20),
-         gr.File(label="Download Results")
-     ],
-     title="Claude-Enhanced Document OCR and Processing Tool",
-     description="Upload PDF documents to scan, process, and clean the text using Claude AI.",
-     allow_flagging="never"
- )
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     iface.launch()
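A note on the removed process_with_claude: as written, the request would be rejected by the Anthropic Messages API, which requires an anthropic-version header and takes the system prompt as a top-level "system" field rather than a "system" role inside messages. A minimal corrected sketch, assuming the key is read from the CLAUDE_API_KEY environment variable (moot for the new version below, which swaps the API call for local models):

import os
import requests

def summarize_ocr_text(text: str) -> str:
    """Ask Claude to clean up and summarize OCR output (sketch)."""
    response = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": os.environ["CLAUDE_API_KEY"],
            "anthropic-version": "2023-06-01",  # required header
            "content-type": "application/json",
        },
        json={
            "model": "claude-2.1",
            "max_tokens": 1000,
            # The system prompt is a top-level field, not a message role
            "system": "You are an expert at summarizing and cleaning up OCR text.",
            "messages": [{"role": "user", "content": f"Please summarize and clean up: {text[:4000]}"}],
        },
    )
    response.raise_for_status()
    return response.json()["content"][0]["text"]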
+ import gradio as gr
+ from typing import Dict
+ import logging
+ import tempfile
+ import io
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from pdf2image import convert_from_bytes
+ from PIL import Image
+ import pytesseract
+ import docx2txt
+ import docx  # python-docx; save_as_docx below needs it but the original omitted this import
+ from reportlab.lib.pagesizes import letter
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+ from reportlab.lib.styles import getSampleStyleSheet
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+ class AdvancedDocProcessor:
+     def __init__(self):
+         # Initialize BART model for text cleaning and summarization
+         self.bart_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+         self.bart_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+
+         # Initialize T5 model for text generation tasks
+         self.t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
+         self.t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
+
+         # Initialize pipeline for named entity recognition
+         self.ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
+
+     def extract_text(self, file_content: bytes, file_type: str) -> str:
+         """Extract text from various file types."""
+         try:
+             if file_type == "application/pdf":
+                 return self.extract_text_from_pdf(file_content)
+             elif file_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                 return self.extract_text_from_docx(file_content)
+             elif file_type == "text/plain":
+                 return file_content.decode('utf-8')
+             else:
+                 raise ValueError(f"Unsupported file type: {file_type}")
+         except Exception as e:
+             logger.error(f"Error extracting text: {str(e)}")
+             return ""
+
+     def extract_text_from_pdf(self, pdf_content: bytes) -> str:
+         """Extract text from PDF using OCR."""
+         images = convert_from_bytes(pdf_content)
+         text = ""
+         for image in images:
+             text += pytesseract.image_to_string(image)
+         return text
+
+     def extract_text_from_docx(self, docx_content: bytes) -> str:
+         """Extract text from a DOCX file."""
+         return docx2txt.process(io.BytesIO(docx_content))
+
+     def clean_and_summarize_text(self, text: str) -> str:
+         """Clean and summarize the text using BART."""
+         inputs = self.bart_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+         summary_ids = self.bart_model.generate(inputs["input_ids"], num_beams=4, max_length=150, early_stopping=True)
+         return self.bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+     def process_with_t5(self, text: str, prompt: str) -> str:
+         """Process the text with T5 based on the given prompt."""
+         input_text = f"{prompt} {text}"
+         input_ids = self.t5_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
+         # do_sample=True added so that temperature=0.7 actually affects generation
+         outputs = self.t5_model.generate(input_ids, max_length=150, num_return_sequences=1, do_sample=True, temperature=0.7)
+         return self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     def extract_entities(self, text: str) -> str:
+         """Extract named entities from the text."""
+         entities = self.ner_pipeline(text)
+         unique_entities = set((ent['word'], ent['entity']) for ent in entities)
+         return "\n".join([f"{word} ({entity})" for word, entity in unique_entities])
+
+     def process_document(self, file_content: bytes, file_type: str, prompt: str) -> Dict[str, str]:
+         raw_text = self.extract_text(file_content, file_type)
+         cleaned_text = self.clean_and_summarize_text(raw_text)
+         processed_text = self.process_with_t5(cleaned_text, prompt)
+         entities = self.extract_entities(raw_text)
+
+         return {
+             "original": raw_text,
+             "cleaned": cleaned_text,
+             "processed": processed_text,
+             "entities": entities
+         }
+
+ def infer_file_type(file_content: bytes) -> str:
+     """Infer the file type from the byte content."""
+     if file_content.startswith(b'%PDF'):
+         return "application/pdf"
+     elif file_content.startswith(b'PK\x03\x04'):
+         return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+     else:
+         return "text/plain"
+
+ def create_gradio_interface():
+     processor = AdvancedDocProcessor()
+
+     def process_and_display(file, prompt, output_format):
+         file_content = file
+         file_type = infer_file_type(file_content)
+         results = processor.process_document(file_content, file_type, prompt)
+
+         if output_format == "txt":
+             output_path = save_as_txt(results)
+         elif output_format == "docx":
+             output_path = save_as_docx(results)
+         else:  # pdf
+             output_path = save_as_pdf(results)
+
+         return (f"Original Text (first 500 chars):\n{results['original'][:500]}...\n\n"
+                 f"Cleaned and Summarized Text:\n{results['cleaned']}\n\n"
+                 f"Processed Text:\n{results['processed']}\n\n"
+                 f"Extracted Entities:\n{results['entities']}"), output_path
+
+     iface = gr.Interface(
+         fn=process_and_display,
+         inputs=[
+             gr.File(label="Upload Document (PDF, DOCX, or TXT)", type="binary"),
+             gr.Textbox(label="Enter your prompt for processing", lines=3),
+             gr.Radio(["txt", "docx", "pdf"], label="Output Format", value="txt")
+         ],
+         outputs=[
+             gr.Textbox(label="Processing Results", lines=30),
+             gr.File(label="Download Processed Document")
+         ],
+         title="Advanced Document Processing Tool",
+         description="Upload a document (PDF, DOCX, or TXT) and enter a prompt to process and analyze the text using state-of-the-art NLP models.",
+     )
+
+     return iface
+
+ def save_as_txt(results: Dict[str, str]) -> str:
+     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
+         for key, value in results.items():
+             temp_file.write(f"{key.upper()}:\n{value}\n\n")
+         return temp_file.name
+
+ def save_as_docx(results: Dict[str, str]) -> str:
+     doc = docx.Document()
+     for key, value in results.items():
+         doc.add_heading(key.capitalize(), level=1)
+         doc.add_paragraph(value)
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp:
+         doc.save(tmp.name)
+         return tmp.name
+
+ def save_as_pdf(results: Dict[str, str]) -> str:
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
+         doc = SimpleDocTemplate(tmp.name, pagesize=letter)
+         styles = getSampleStyleSheet()
+         story = []
+
+         for key, value in results.items():
+             story.append(Paragraph(key.capitalize(), styles['Heading1']))
+             story.append(Paragraph(value, styles['BodyText']))
+             story.append(Spacer(1, 12))
+
+         doc.build(story)
+         return tmp.name
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     iface = create_gradio_interface()
+     iface.launch()
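For reference, the new pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the module is importable as app and using a hypothetical input file sample.pdf (the first run downloads the BART, FLAN-T5, and NER weights):

import app

processor = app.AdvancedDocProcessor()
with open("sample.pdf", "rb") as f:  # hypothetical input file
    content = f.read()

results = processor.process_document(content, app.infer_file_type(content), "List the key points of this document:")
print(results["cleaned"])   # BART summary of the raw OCR text
print(results["entities"])  # one "word (ENTITY)" line per recognized entity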
packages.txt CHANGED
@@ -1,3 +1,2 @@
- tesseract-ocr
- libtesseract-dev
- libleptonica-dev

+ tesseract-ocr
+ libtesseract-dev
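One caveat: the new app.py renders PDF pages through pdf2image, which depends on the Poppler utilities at the system level, so this list probably also needs the poppler-utils package (an assumption based on the new imports, not something this commit ships). A sketch of the amended packages.txt:

tesseract-ocr
libtesseract-dev
poppler-utils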
 
requirements.txt CHANGED
@@ -1,7 +1,6 @@
- gradio
- pytesseract
- PyMuPDF
- Pillow
- torch
- transformers
- tqdm

+ gradio
+ pytesseract
+ PyMuPDF
+ Pillow
+ torch
+ transformers
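These pins lag the rewritten app.py, which now imports pdf2image, docx2txt, docx (python-docx), and reportlab but no longer touches PyMuPDF; the FLAN-T5 tokenizer may also need sentencepiece. A sketch of a requirements.txt matching the new code (the usual PyPI package names, not verified against this Space):

gradio
pytesseract
Pillow
torch
transformers
sentencepiece
pdf2image
docx2txt
python-docx
reportlab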