lukiod committed
Commit 0ae684c
1 Parent(s): fff6204

Update app.py

Files changed (1)
  1. app.py +104 -42
app.py CHANGED
@@ -1,62 +1,124 @@
 import streamlit as st
+import torch
+from PIL import Image
+import gc
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
-from PIL import Image
-import torch
+from colpali_engine.models.paligemma_colbert_architecture import ColPali
+from colpali_engine.utils.colpali_processing_utils import process_images, process_queries
+from torch.utils.data import DataLoader
+
+# Function to load Colpali model
+@st.cache_resource
+def load_colpali_model():
+    model = ColPali.from_pretrained("vidore/colpaligemma-3b-mix-448-base", torch_dtype=torch.float32, device_map="cpu").eval()
+    model.load_adapter("vidore/colpali")
+    processor = AutoProcessor.from_pretrained("vidore/colpali")
+    return model, processor
 
-# Load the model and processor
+# Function to load Qwen2-VL model
 @st.cache_resource
-def load_model():
-    # Load Qwen2-VL-7B on CPU
+def load_qwen_model():
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.float32, device_map="cpu"
     )
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
     return model, processor
 
-model, processor = load_model()
+# Function to clear GPU memory
+def clear_memory():
+    gc.collect()
+    torch.cuda.empty_cache()
 
 # Streamlit Interface
-st.title("Qwen2-VL-7B Multimodal Demo")
-st.write("Upload an image and provide a text prompt to see the model's response.")
+st.title("OCR and Visual Language Model Demo")
+st.write("Upload an image for OCR extraction and then ask a question about the image.")
 
 # Image uploader
 image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
 
-# Text input field
-text = st.text_input("Enter a text description or query")
-
-# If both image and text are provided
-if image and text:
-    # Load image with PIL
+if image:
     img = Image.open(image)
     st.image(img, caption="Uploaded Image", use_column_width=True)
 
-    # Prepare inputs for Qwen2-VL
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": img},
-                {"type": "text", "text": text},
-            ],
-        }
-    ]
-
-    # Prepare for inference
-    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, _ = process_vision_info(messages)
-    inputs = processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")
-
-    # Move tensors to CPU
-    inputs = inputs.to("cpu")
-
-    # Run the model and generate output
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=128)
-
-    # Decode the output text
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
-
-    # Display the response
-    st.write("Model's response:", generated_text)
+    # OCR Extraction with Colpali
+    st.write("Extracting text from image...")
+    colpali_model, colpali_processor = load_colpali_model()
+
+    # Process image for Colpali
+    dataloader = DataLoader(
+        [img],
+        batch_size=1,
+        shuffle=False,
+        collate_fn=lambda x: process_images(colpali_processor, x),
+    )
+
+    for batch_doc in dataloader:
+        with torch.no_grad():
+            batch_doc = {k: v.to('cpu') for k, v in batch_doc.items()}
+            embeddings_doc = colpali_model(**batch_doc)
+
+    # For simplicity, we'll use a dummy query to extract text
+    dummy_query = "Extract all text from the image"
+    query_dataloader = DataLoader(
+        [dummy_query],
+        batch_size=1,
+        shuffle=False,
+        collate_fn=lambda x: process_queries(colpali_processor, x, Image.new("RGB", (448, 448), (255, 255, 255))),
+    )
+
+    for batch_query in query_dataloader:
+        with torch.no_grad():
+            batch_query = {k: v.to('cpu') for k, v in batch_query.items()}
+            embeddings_query = colpali_model(**batch_query)
+
+    # In a real scenario, you'd use these embeddings to extract text
+    # For this demo, we'll just show a placeholder text
+    extracted_text = "This is a placeholder for the extracted text. In a real scenario, you would use the embeddings to extract actual text from the image."
+
+    st.write("Extracted Text:")
+    st.write(extracted_text)
+
+    # Clear Colpali model from memory
+    del colpali_model, colpali_processor
+    clear_memory()
+
+    # Text input field for question
+    question = st.text_input("Ask a question about the image and extracted text")
+
+    if question:
+        st.write("Processing with Qwen2-VL...")
+        qwen_model, qwen_processor = load_qwen_model()
+
+        # Prepare inputs for Qwen2-VL
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": img},
+                    {"type": "text", "text": f"Extracted text: {extracted_text}\n\nQuestion: {question}"},
+                ],
+            }
+        ]
+
+        # Prepare for inference
+        text_input = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, _ = process_vision_info(messages)
+        inputs = qwen_processor(text=[text_input], images=image_inputs, padding=True, return_tensors="pt")
+
+        # Move tensors to CPU
+        inputs = inputs.to("cpu")
+
+        # Run the model and generate output
+        with torch.no_grad():
+            generated_ids = qwen_model.generate(**inputs, max_new_tokens=128)
+
+        # Decode the output text
+        generated_text = qwen_processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        # Display the response
+        st.write("Model's response:", generated_text)
+
+        # Clear Qwen model from memory
+        del qwen_model, qwen_processor
+        clear_memory()
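
A note on the placeholder in the new app.py: ColPali produces multi-vector embeddings meant for retrieval-style relevance scoring rather than literal text extraction. A minimal sketch of how the diff's embeddings_query and embeddings_doc could be compared with late-interaction (MaxSim) scoring, assuming both are tensors shaped (batch, tokens, dim) as the ColPali forward pass returns:

import torch

def maxsim_score(embeddings_query: torch.Tensor, embeddings_doc: torch.Tensor) -> torch.Tensor:
    # Token-level similarity between every query token and every document patch;
    # the result has shape (n_queries, n_docs, query_tokens, doc_tokens).
    sim = torch.einsum("qnd,pmd->qpnm", embeddings_query, embeddings_doc)
    # MaxSim: keep the best-matching patch per query token, then sum over query tokens.
    return sim.max(dim=-1).values.sum(dim=-1)

# Illustrative shapes only (ColPali emits 128-dimensional vectors per token/patch).
scores = maxsim_score(torch.randn(1, 20, 128), torch.randn(1, 1030, 128))
print(scores.shape)  # (n_queries, n_docs) relevance matrix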
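One behaviour worth knowing about the generation step (in both the old and new code): batch_decode(generated_ids, ...) returns the chat prompt together with the answer, because generate() keeps the input ids at the start of each sequence. A minimal sketch of the usual trimming step, reusing the diff's inputs, generated_ids, and qwen_processor names:

# Drop the prompt tokens from each sequence so only the newly generated answer is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
answer = qwen_processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
st.write("Model's response:", answer)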