Ni3SinghR committed on
Commit
e535f9e
1 Parent(s): a4c6702

Upload ocr_project.py

Browse files
Files changed (1) hide show
  1. ocr_project.py +51 -0
ocr_project.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import easyocr
3
+ from transformers import AutoModel, AutoTokenizer
4
+ from PIL import Image
5
+ import warnings
6
+ from transformers import logging
7
+ import re
8
+
9
# Suppress specific warnings. `message` is treated by `warnings` as a regular
# expression that must match the start of the warning text.
for _ignored_message in (
    "The attention mask and the pad token id were not set.",
    "Setting `pad_token_id` to `eos_token_id`",
    "The `seen_tokens` attribute is deprecated",
):
    warnings.filterwarnings("ignore", message=_ignored_message)

# Only let transformers report errors.
logging.set_verbosity_error()

# Tokenizer and model are loaded from the same checkpoint; the model is
# switched to evaluation mode right after loading.
_GOT_CHECKPOINT = 'srimanth-d/GOT_CPU'

tokenizer = AutoTokenizer.from_pretrained(_GOT_CHECKPOINT, trust_remote_code=True)
model = AutoModel.from_pretrained(
    _GOT_CHECKPOINT,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    pad_token_id=tokenizer.eos_token_id,
).eval()

# EasyOCR reader configured for the 'hi' language code, with GPU disabled.
easyocr_reader = easyocr.Reader(['hi'], gpu=False)
23
+
24
+ # Function to perform OCR based on selected language
25
def perform_ocr(image, language):
    """Run OCR on ``image`` with the engine selected by ``language``.

    Args:
        image: The image to read. It is converted with ``np.array(image)``
            for the Hindi path and written to disk with ``image.save(path)``
            for the English path, so it is expected to be a PIL image
            (or an object offering the same interface).
        language: ``"Hindi"`` to use the EasyOCR reader, ``"English"`` to use
            the GOT model. The comparison is case-sensitive.

    Returns:
        The recognised text. For Hindi this is the EasyOCR fragments joined
        with single spaces; for English it is whatever ``model.chat`` returns.
        For any other ``language`` a fixed error message string is returned
        instead of raising.
    """
    if language == "Hindi":
        image_np = np.array(image)
        # detail=0 makes EasyOCR return only the text fragments.
        result = easyocr_reader.readtext(image_np, detail=0)
        return ' '.join(result)

    if language == "English":
        # Imported locally so this function is self-contained with respect to
        # the file's existing import block.
        import os
        import tempfile

        # model.chat takes a file path, so the image has to be written out.
        # A per-call temporary directory (instead of a fixed file name in the
        # working directory) keeps concurrent calls from overwriting each
        # other's input and guarantees the file is removed afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            image_path = os.path.join(tmp_dir, 'temp_image.png')
            image.save(image_path)
            return model.chat(tokenizer, image_path, ocr_type='ocr')

    return "Invalid language selection. Please choose Hindi or English."
37
+
38
def process_keyword(image, language, keyword):
    """Extract text from ``image`` and highlight occurrences of ``keyword``.

    Args:
        image: Image passed through unchanged to ``perform_ocr``.
        language: Language selector passed through to ``perform_ocr``.
        keyword: Text to look for. When falsy (e.g. empty string or ``None``)
            no search is performed.

    Returns:
        * the raw OCR text when ``keyword`` is falsy;
        * the OCR text with every case-insensitive, literal occurrence of
          ``keyword`` wrapped in a yellow ``<mark>`` element when at least one
          occurrence exists;
        * a "No keyword ... found" message otherwise.
    """
    ocr_output = perform_ocr(image, language)

    # Nothing to search for: hand the OCR text back untouched.
    if not keyword:
        return ocr_output

    # Escape the keyword so regex metacharacters in it are matched literally;
    # the capture group lets the replacement keep the text's original casing.
    literal_pattern = re.compile(f'({re.escape(keyword)})', flags=re.IGNORECASE)
    marked_text = literal_pattern.sub(
        r'<mark style="background-color: yellow">\1</mark>', ocr_output
    )

    # An unchanged string means the keyword never matched.
    if marked_text == ocr_output:
        return f"No keyword '{keyword}' found in the text."
    return marked_text