streamlit_qwen2_withbyaldi

Sleeping

App Files Files Community

lukiod committed on Sep 27

Commit

be730b6

•

1 Parent(s): 9353556

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -42

app.py CHANGED Viewed

@@ -2,19 +2,15 @@ import streamlit as st
 import torch
 from PIL import Image
 import gc
-from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
-from colpali_engine.models.paligemma_colbert_architecture import ColPali
-from colpali_engine.utils.colpali_processing_utils import process_images, process_queries
-from torch.utils.data import DataLoader
-# Function to load Colpali model
 @st.cache_resource
-def load_colpali_model():
-    model = ColPali.from_pretrained("vidore/colpaligemma-3b-mix-448-base", torch_dtype=torch.float32, device_map="cpu").eval()
-    model.load_adapter("vidore/colpali")
-    processor = AutoProcessor.from_pretrained("vidore/colpali")
-    return model, processor
 # Function to load Qwen2-VL model
 @st.cache_resource
@@ -41,46 +37,28 @@ if image:
     img = Image.open(image)
     st.image(img, caption="Uploaded Image", use_column_width=True)
-    # OCR Extraction with Colpali
     st.write("Extracting text from image...")
-    colpali_model, colpali_processor = load_colpali_model()
-    # Process image for Colpali
-    dataloader = DataLoader(
-        [img],
-        batch_size=1,
-        shuffle=False,
-        collate_fn=lambda x: process_images(colpali_processor, x),
-    )
-    for batch_doc in dataloader:
-        with torch.no_grad():
-            batch_doc = {k: v.to('cpu') for k, v in batch_doc.items()}
-            embeddings_doc = colpali_model(**batch_doc)
-    # For simplicity, we'll use a dummy query to extract text
-    dummy_query = "Extract all text from the image"
-    query_dataloader = DataLoader(
-        [dummy_query],
-        batch_size=1,
-        shuffle=False,
-        collate_fn=lambda x: process_queries(colpali_processor, x, Image.new("RGB", (448, 448), (255, 255, 255))),
-    )
-    for batch_query in query_dataloader:
-        with torch.no_grad():
-            batch_query = {k: v.to('cpu') for k, v in batch_query.items()}
-            embeddings_query = colpali_model(**batch_query)
-    # In a real scenario, you'd use these embeddings to extract text
-    # For this demo, we'll just show a placeholder text
-    extracted_text = "This is a placeholder for the extracted text. In a real scenario, you would use the embeddings to extract actual text from the image."
     st.write("Extracted Text:")
     st.write(extracted_text)
-    # Clear Colpali model from memory
-    del colpali_model, colpali_processor
     clear_memory()
     # Text input field for question

 import torch
 from PIL import Image
 import gc
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from byaldi import RAGMultiModalModel
+# Function to load Byaldi model
 @st.cache_resource
+def load_byaldi_model():
+    model = RAGMultiModalModel.from_pretrained("vidore/colpali-v1.2", device="cpu")
+    return model
 # Function to load Qwen2-VL model
 @st.cache_resource
     img = Image.open(image)
     st.image(img, caption="Uploaded Image", use_column_width=True)
+    # OCR Extraction with Byaldi
     st.write("Extracting text from image...")
+    byaldi_model = load_byaldi_model()
+    # Create a temporary index for the uploaded image
+    with st.spinner("Processing image..."):
+        byaldi_model.index(img, index_name="temp_index", overwrite=True)
+    # Perform a dummy search to get the OCR results
+    ocr_results = byaldi_model.search("Extract all text from the image", k=1)
+    # Extract the OCR text from the results
+    if ocr_results:
+        extracted_text = ocr_results[0].metadata.get("ocr_text", "No text extracted")
+    else:
+        extracted_text = "No text extracted"
     st.write("Extracted Text:")
     st.write(extracted_text)
+    # Clear Byaldi model from memory
+    del byaldi_model
     clear_memory()
     # Text input field for question