Shak33l-UiRev committed on
Commit
9ce6b31
1 Parent(s): 5a29686

getting confused on path

Browse files
Files changed (1) hide show
  1. app.py +35 -104
app.py CHANGED
@@ -8,7 +8,8 @@ from transformers import (
8
  LayoutLMv3Processor,
9
  LayoutLMv3ForSequenceClassification,
10
  AutoProcessor,
11
- AutoModelForCausalLM
 
12
  )
13
  from ultralytics import YOLO
14
  import io
@@ -27,76 +28,35 @@ logger = logging.getLogger(__name__)
27
 
28
  @st.cache_resource
29
  def load_model(model_name):
30
- """Load the selected model and processor
31
-
32
- Args:
33
- model_name (str): Name of the model to load ("Donut", "LayoutLMv3", or "OmniParser")
34
-
35
- Returns:
36
- dict: Dictionary containing model components
37
- """
38
  try:
39
  if model_name == "OmniParser":
40
  try:
41
- # First try loading from HuggingFace Hub with correct repository structure
42
- yolo_model = YOLO("microsoft/OmniParser/icon_detect") # Updated path
43
-
44
  processor = AutoProcessor.from_pretrained(
45
- "microsoft/OmniParser/icon_caption_florence", # Updated path
46
  trust_remote_code=True
47
  )
48
 
49
- caption_model = AutoModelForCausalLM.from_pretrained(
50
- "microsoft/OmniParser/icon_caption_florence", # Updated path
51
  trust_remote_code=True,
52
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
53
  )
54
 
55
  if torch.cuda.is_available():
56
- caption_model = caption_model.to("cuda")
57
 
58
- st.success("Successfully loaded OmniParser models")
59
  return {
60
- 'yolo': yolo_model,
61
  'processor': processor,
62
- 'model': caption_model
63
  }
64
 
65
  except Exception as e:
66
  st.error(f"Failed to load OmniParser from HuggingFace Hub: {str(e)}")
67
-
68
- # Try loading from local weights if available
69
- weights_path = "weights"
70
- if os.path.exists(os.path.join(weights_path, "icon_detect/model.safetensors")):
71
- st.info("Attempting to load from local weights...")
72
-
73
- yolo_model = YOLO(os.path.join(weights_path, "icon_detect/model.safetensors"))
74
-
75
- processor = AutoProcessor.from_pretrained(
76
- os.path.join(weights_path, "icon_caption_florence"),
77
- trust_remote_code=True,
78
- local_files_only=True
79
- )
80
-
81
- caption_model = AutoModelForCausalLM.from_pretrained(
82
- os.path.join(weights_path, "icon_caption_florence"),
83
- trust_remote_code=True,
84
- local_files_only=True,
85
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
86
- )
87
-
88
- if torch.cuda.is_available():
89
- caption_model = caption_model.to("cuda")
90
-
91
- st.success("Successfully loaded OmniParser from local weights")
92
- return {
93
- 'yolo': yolo_model,
94
- 'processor': processor,
95
- 'model': caption_model
96
- }
97
- else:
98
- st.error("Could not find local weights and HuggingFace Hub loading failed")
99
- raise ValueError("No valid model weights found for OmniParser")
100
 
101
  elif model_name == "Donut":
102
  processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
@@ -132,61 +92,32 @@ def analyze_document(image, model_name, models_dict):
132
  return {"error": "Model failed to load", "type": "model_error"}
133
 
134
  if model_name == "OmniParser":
135
- # Configure detection parameters
136
- box_threshold = 0.05 # Confidence threshold for detection
137
- iou_threshold = 0.1 # IoU threshold for NMS
 
 
138
 
139
- # Save image temporarily for YOLO processing
140
- temp_path = "temp_image.png"
141
- image.save(temp_path)
142
 
143
- try:
144
- # Run YOLO detection
145
- yolo_results = models_dict['yolo'](
146
- temp_path,
147
- conf=box_threshold,
148
- iou=iou_threshold
149
- )
150
-
151
- # Process detections and generate captions
152
- results = []
153
- for det in yolo_results[0].boxes.data:
154
- x1, y1, x2, y2, conf, cls = det
155
-
156
- # Get region of interest
157
- roi = image.crop((int(x1), int(y1), int(x2), int(y2)))
158
-
159
- # Generate caption using the model
160
- inputs = models_dict['processor'](
161
- images=roi,
162
- return_tensors="pt"
163
- )
164
-
165
- outputs = models_dict['model'].generate(
166
- **inputs,
167
- max_length=50,
168
- num_beams=4,
169
- temperature=0.7
170
- )
171
-
172
- caption = models_dict['processor'].decode(outputs[0], skip_special_tokens=True)
173
-
174
- results.append({
175
- "bbox": [float(x) for x in [x1, y1, x2, y2]],
176
- "confidence": float(conf),
177
- "class": int(cls),
178
- "caption": caption
179
- })
180
-
181
- return {
182
- "detected_elements": len(results),
183
- "elements": results
184
  }
185
-
186
- finally:
187
- # Clean up temporary file
188
- if os.path.exists(temp_path):
189
- os.remove(temp_path)
190
 
191
  elif model_name == "Donut":
192
  model = models_dict['model']
 
8
  LayoutLMv3Processor,
9
  LayoutLMv3ForSequenceClassification,
10
  AutoProcessor,
11
+ AutoModelForCausalLM,
12
+ AutoModelForVisualQuestionAnswering
13
  )
14
  from ultralytics import YOLO
15
  import io
 
28
 
29
  @st.cache_resource
30
  def load_model(model_name):
31
+ """Load the selected model and processor"""
 
 
 
 
 
 
 
32
  try:
33
  if model_name == "OmniParser":
34
  try:
35
+ # Load model directly using official implementation
 
 
36
  processor = AutoProcessor.from_pretrained(
37
+ "microsoft/OmniParser",
38
  trust_remote_code=True
39
  )
40
 
41
+ model = AutoModelForVisualQuestionAnswering.from_pretrained(
42
+ "microsoft/OmniParser",
43
  trust_remote_code=True,
44
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
45
  )
46
 
47
  if torch.cuda.is_available():
48
+ model = model.to("cuda")
49
 
50
+ st.success("Successfully loaded OmniParser model")
51
  return {
 
52
  'processor': processor,
53
+ 'model': model
54
  }
55
 
56
  except Exception as e:
57
  st.error(f"Failed to load OmniParser from HuggingFace Hub: {str(e)}")
58
+ logger.error(f"OmniParser loading error: {str(e)}", exc_info=True)
59
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  elif model_name == "Donut":
62
  processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
 
92
  return {"error": "Model failed to load", "type": "model_error"}
93
 
94
  if model_name == "OmniParser":
95
+ # Process image with OmniParser
96
+ inputs = models_dict['processor'](
97
+ images=image,
98
+ return_tensors="pt",
99
+ )
100
 
101
+ if torch.cuda.is_available():
102
+ inputs = {k: v.to("cuda") if hasattr(v, "to") else v
103
+ for k, v in inputs.items()}
104
 
105
+ # Generate outputs
106
+ outputs = models_dict['model'](**inputs)
107
+
108
+ # Process results
109
+ # The exact processing will depend on the model's output format
110
+ results = {
111
+ "predictions": outputs.logits.softmax(-1).tolist(),
112
+ "detected_elements": len(outputs.logits[0]),
113
+ "model_output": {
114
+ k: v.tolist() if hasattr(v, "tolist") else str(v)
115
+ for k, v in outputs.items()
116
+ if k != "last_hidden_state" # Skip large tensors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
+ }
119
+
120
+ return results
 
 
121
 
122
  elif model_name == "Donut":
123
  model = models_dict['model']