Spaces:

Shak33l-UiRev
/

Ui-Rev-Doc-Model

Running

App Files Files Community

Shak33l-UiRev committed 11 days ago

Commit

9ce6b31

•

1 Parent(s): 5a29686

getting confused on path

Browse files

Files changed (1) hide show

app.py +35 -104

app.py CHANGED Viewed

@@ -8,7 +8,8 @@ from transformers import (
     LayoutLMv3Processor,
     LayoutLMv3ForSequenceClassification,
     AutoProcessor,
-    AutoModelForCausalLM
 )
 from ultralytics import YOLO
 import io
@@ -27,76 +28,35 @@ logger = logging.getLogger(__name__)
 @st.cache_resource
 def load_model(model_name):
-    """Load the selected model and processor
-    Args:
-        model_name (str): Name of the model to load ("Donut", "LayoutLMv3", or "OmniParser")
-    Returns:
-        dict: Dictionary containing model components
-    """
     try:
         if model_name == "OmniParser":
             try:
-                # First try loading from HuggingFace Hub with correct repository structure
-                yolo_model = YOLO("microsoft/OmniParser/icon_detect")  # Updated path
                 processor = AutoProcessor.from_pretrained(
-                    "microsoft/OmniParser/icon_caption_florence",  # Updated path
                     trust_remote_code=True
                 )
-                caption_model = AutoModelForCausalLM.from_pretrained(
-                    "microsoft/OmniParser/icon_caption_florence",  # Updated path
                     trust_remote_code=True,
                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
                 )
                 if torch.cuda.is_available():
-                    caption_model = caption_model.to("cuda")
-                st.success("Successfully loaded OmniParser models")
                 return {
-                    'yolo': yolo_model,
                     'processor': processor,
-                    'model': caption_model
                 }
             except Exception as e:
                 st.error(f"Failed to load OmniParser from HuggingFace Hub: {str(e)}")
-                # Try loading from local weights if available
-                weights_path = "weights"
-                if os.path.exists(os.path.join(weights_path, "icon_detect/model.safetensors")):
-                    st.info("Attempting to load from local weights...")
-                    yolo_model = YOLO(os.path.join(weights_path, "icon_detect/model.safetensors"))
-                    processor = AutoProcessor.from_pretrained(
-                        os.path.join(weights_path, "icon_caption_florence"),
-                        trust_remote_code=True,
-                        local_files_only=True
-                    )
-                    caption_model = AutoModelForCausalLM.from_pretrained(
-                        os.path.join(weights_path, "icon_caption_florence"),
-                        trust_remote_code=True,
-                        local_files_only=True,
-                        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-                    )
-                    if torch.cuda.is_available():
-                        caption_model = caption_model.to("cuda")
-                    st.success("Successfully loaded OmniParser from local weights")
-                    return {
-                        'yolo': yolo_model,
-                        'processor': processor,
-                        'model': caption_model
-                    }
-                else:
-                    st.error("Could not find local weights and HuggingFace Hub loading failed")
-                    raise ValueError("No valid model weights found for OmniParser")
         elif model_name == "Donut":
             processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
@@ -132,61 +92,32 @@ def analyze_document(image, model_name, models_dict):
             return {"error": "Model failed to load", "type": "model_error"}
         if model_name == "OmniParser":
-            # Configure detection parameters
-            box_threshold = 0.05  # Confidence threshold for detection
-            iou_threshold = 0.1   # IoU threshold for NMS
-            # Save image temporarily for YOLO processing
-            temp_path = "temp_image.png"
-            image.save(temp_path)
-            try:
-                # Run YOLO detection
-                yolo_results = models_dict['yolo'](
-                    temp_path,
-                    conf=box_threshold,
-                    iou=iou_threshold
-                )
-                # Process detections and generate captions
-                results = []
-                for det in yolo_results[0].boxes.data:
-                    x1, y1, x2, y2, conf, cls = det
-                    # Get region of interest
-                    roi = image.crop((int(x1), int(y1), int(x2), int(y2)))
-                    # Generate caption using the model
-                    inputs = models_dict['processor'](
-                        images=roi,
-                        return_tensors="pt"
-                    )
-                    outputs = models_dict['model'].generate(
-                        **inputs,
-                        max_length=50,
-                        num_beams=4,
-                        temperature=0.7
-                    )
-                    caption = models_dict['processor'].decode(outputs[0], skip_special_tokens=True)
-                    results.append({
-                        "bbox": [float(x) for x in [x1, y1, x2, y2]],
-                        "confidence": float(conf),
-                        "class": int(cls),
-                        "caption": caption
-                    })
-                return {
-                    "detected_elements": len(results),
-                    "elements": results
                 }
-            finally:
-                # Clean up temporary file
-                if os.path.exists(temp_path):
-                    os.remove(temp_path)
         elif model_name == "Donut":
             model = models_dict['model']

     LayoutLMv3Processor,
     LayoutLMv3ForSequenceClassification,
     AutoProcessor,
+    AutoModelForCausalLM,
+    AutoModelForVisualQuestionAnswering
 )
 from ultralytics import YOLO
 import io
 @st.cache_resource
 def load_model(model_name):
+    """Load the selected model and processor"""
     try:
         if model_name == "OmniParser":
             try:
+                # Load model directly using official implementation
                 processor = AutoProcessor.from_pretrained(
+                    "microsoft/OmniParser",
                     trust_remote_code=True
                 )
+                model = AutoModelForVisualQuestionAnswering.from_pretrained(
+                    "microsoft/OmniParser",
                     trust_remote_code=True,
                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
                 )
                 if torch.cuda.is_available():
+                    model = model.to("cuda")
+                st.success("Successfully loaded OmniParser model")
                 return {
                     'processor': processor,
+                    'model': model
                 }
             except Exception as e:
                 st.error(f"Failed to load OmniParser from HuggingFace Hub: {str(e)}")
+                logger.error(f"OmniParser loading error: {str(e)}", exc_info=True)
+                return None
         elif model_name == "Donut":
             processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
             return {"error": "Model failed to load", "type": "model_error"}
         if model_name == "OmniParser":
+            # Process image with OmniParser
+            inputs = models_dict['processor'](
+                images=image,
+                return_tensors="pt",
+            )
+            if torch.cuda.is_available():
+                inputs = {k: v.to("cuda") if hasattr(v, "to") else v
+                         for k, v in inputs.items()}
+            # Generate outputs
+            outputs = models_dict['model'](**inputs)
+            # Process results
+            # The exact processing will depend on the model's output format
+            results = {
+                "predictions": outputs.logits.softmax(-1).tolist(),
+                "detected_elements": len(outputs.logits[0]),
+                "model_output": {
+                    k: v.tolist() if hasattr(v, "tolist") else str(v)
+                    for k, v in outputs.items()
+                    if k != "last_hidden_state"  # Skip large tensors
                 }
+            }
+            return results
         elif model_name == "Donut":
             model = models_dict['model']