sprakhil committed on
Commit 6d4955e
1 Parent(s): 1512254

using gradio instead of streamlit

Files changed (2)
  1. app.py +37 -31
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,4 +1,4 @@
-import streamlit as st
+import gradio as gr
 from PIL import Image
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, pipeline
@@ -19,44 +19,28 @@ login(token=hf_token)
 try:
     image_to_text_pipeline = pipeline("image-to-text", model="google/paligemma-3b-mix-448", device=0 if torch.cuda.is_available() else -1)
 except Exception as e:
-    st.error(f"Error loading image-to-text model: {e}")
-    st.stop()
+    raise Exception(f"Error loading image-to-text model: {e}")
 
 # Load ColPali model with Hugging Face token
 try:
     model_colpali = ColPali.from_pretrained("vidore/colpali-v1.2", torch_dtype=torch.bfloat16).to(device)
     processor_colpali = ColPaliProcessor.from_pretrained("google/paligemma-3b-mix-448")
 except Exception as e:
-    st.error(f"Error loading ColPali model or processor: {e}")
-    st.stop()
+    raise Exception(f"Error loading ColPali model or processor: {e}")
 
 # Load Qwen model
 try:
     model_qwen = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct").to(device)
     processor_qwen = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 except Exception as e:
-    st.error(f"Error loading Qwen model or processor: {e}")
-    st.stop()
+    raise Exception(f"Error loading Qwen model or processor: {e}")
 
-# Streamlit UI
-st.title("OCR and Document Search Web Application")
-st.write("Upload an image containing text in both Hindi and English for OCR processing and keyword search.")
-
-# File uploader for the image
-uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
-
-if uploaded_file is not None:
+# Function to process the image and extract text
+def process_image(image, keyword):
     try:
-        image = Image.open(uploaded_file)
-        st.image(image, caption='Uploaded Image.', use_column_width=True)
-        st.write("")
-
         # Use the image-to-text pipeline to extract text from the image
         output_text_img_to_text = image_to_text_pipeline(image)
 
-        st.write("Extracted Text from Image:")
-        st.write(output_text_img_to_text)
-
         # Prepare input for Qwen model for image description
         conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
         text_prompt = processor_qwen.apply_chat_template(conversation, add_generation_prompt=True)
@@ -68,18 +52,40 @@ if uploaded_file is not None:
         generated_ids_qwen = [output_ids_qwen[len(input_ids):] for input_ids, output_ids_qwen in zip(inputs_qwen.input_ids, output_ids_qwen)]
         output_text_qwen = processor_qwen.batch_decode(generated_ids_qwen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
-        st.write("Qwen Model Description:")
-        st.write(output_text_qwen)
+        extracted_text = output_text_img_to_text[0]['generated_text']
 
         # Keyword search in the extracted text
-        keyword = st.text_input("Enter a keyword to search in the extracted text:")
+        keyword_found = ""
        if keyword:
-            if keyword.lower() in output_text_img_to_text[0]['generated_text'].lower():
-                st.write(f"Keyword '{keyword}' found in the text.")
+            if keyword.lower() in extracted_text.lower():
+                keyword_found = f"Keyword '{keyword}' found in the text."
             else:
-                st.write(f"Keyword '{keyword}' not found in the text.")
-    except Exception as e:
-        st.error(f"An error occurred: {e}")
+                keyword_found = f"Keyword '{keyword}' not found in the text."
 
+        return extracted_text, output_text_qwen[0], keyword_found
+    except Exception as e:
+        return str(e), "", ""
+
+# Define Gradio Interface
+title = "OCR and Document Search Web Application"
+description = "Upload an image containing text in both Hindi and English for OCR processing and keyword search."
+
+# Gradio interface for input and output
+image_input = gr.inputs.Image(type="pil")
+keyword_input = gr.inputs.Textbox(label="Enter a keyword to search in the extracted text (Optional)")
+output_textbox = gr.outputs.Textbox(label="Extracted Text")
+output_description = gr.outputs.Textbox(label="Qwen Model Description")
+output_keyword_search = gr.outputs.Textbox(label="Keyword Search Result")
+
+# Set up Gradio interface layout
+interface = gr.Interface(
+    fn=process_image,  # Function to call when button is pressed
+    inputs=[image_input, keyword_input],  # Input types (image and keyword)
+    outputs=[output_textbox, output_description, output_keyword_search],  # Outputs (text boxes for results)
+    title=title,
+    description=description
+)
+
+# Launch the Gradio app
 if __name__ == "__main__":
-    st.write("Deploying the web application...")
+    interface.launch(share=True)
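Note: the new app.py wires the interface through the legacy gr.inputs.* and gr.outputs.* namespaces, which are deprecated in Gradio 3.x and were removed in Gradio 4.x. The sketch below shows roughly equivalent wiring with the current component classes; it assumes the same process_image(image, keyword) function from this commit and is illustrative only, not part of the change:

# Hypothetical sketch, not part of this commit: the same interface built
# with current Gradio component classes instead of gr.inputs/gr.outputs.
# Assumes process_image(image, keyword) is defined as in app.py above.
import gradio as gr

interface = gr.Interface(
    fn=process_image,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(label="Enter a keyword to search in the extracted text (Optional)"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Qwen Model Description"),
        gr.Textbox(label="Keyword Search Result"),
    ],
    title="OCR and Document Search Web Application",
    description="Upload an image containing text in both Hindi and English for OCR processing and keyword search.",
)

if __name__ == "__main__":
    interface.launch(share=True)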
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-streamlit
+gradio
 Pillow
 torch
 transformers
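Because app.py still uses the legacy gr.inputs/gr.outputs API, the unpinned gradio entry may resolve to a 4.x release where those namespaces no longer exist; pinning an older release (for example gradio<4) or migrating to the component classes sketched above would likely be needed for the Space to start.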