SkalskiP committed
Commit b32b0a3
1 Parent(s): c803a91

masking API

README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: Florence2 + SAM2
-emoji: 🔥
+title: Florence2 + SAM2 Masking
+emoji: 😷
 colorFrom: purple
 colorTo: green
 sdk: gradio
app.py CHANGED
@@ -1,62 +1,14 @@
-import os
-from typing import Tuple, Optional
+from typing import List
 
-import cv2
 import gradio as gr
-import numpy as np
 import spaces
 import supervision as sv
 import torch
 from PIL import Image
-from tqdm import tqdm
-from utils.video import generate_unique_name, create_directory, delete_directory
 
 from utils.florence import load_florence_model, run_florence_inference, \
-    FLORENCE_DETAILED_CAPTION_TASK, \
-    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
-from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
-    IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
-from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
-
-MARKDOWN = """
-# Florence2 + SAM2 🔥
-
-<div>
-    <a href="https://github.com/facebookresearch/segment-anything-2">
-        <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block;">
-    </a>
-    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-segment-images-with-sam-2.ipynb">
-        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
-    </a>
-    <a href="https://blog.roboflow.com/what-is-segment-anything-2/">
-        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
-    </a>
-    <a href="https://www.youtube.com/watch?v=Dv003fTyO-Y">
-        <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
-    </a>
-</div>
-
-This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
-the first stage, Florence2 performs tasks such as object detection, open-vocabulary
-object detection, image captioning, or phrase grounding. In the second stage, SAM2
-performs object segmentation on the image.
-"""
-
-IMAGE_PROCESSING_EXAMPLES = [
-    [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw, white napkin, black napkin, hair'],
-    [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
-    [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
-    [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
-]
-VIDEO_PROCESSING_EXAMPLES = [
-    ["videos/clip-07-camera-1.mp4", "player in white outfit, player in black outfit, ball, rim"],
-    ["videos/clip-07-camera-2.mp4", "player in white outfit, player in black outfit, ball, rim"],
-    ["videos/clip-07-camera-3.mp4", "player in white outfit, player in black outfit, ball, rim"]
-]
-
-VIDEO_SCALE_FACTOR = 0.5
-VIDEO_TARGET_DIRECTORY = "tmp"
-create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
+    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+from utils.sam import load_sam_image_model, run_sam_inference
 
 DEVICE = torch.device("cuda")
 # DEVICE = torch.device("cpu")
@@ -69,119 +21,21 @@ if torch.cuda.get_device_properties(0).major >= 8:
 
 FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
 SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
-SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
-COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
-COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
-BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
-LABEL_ANNOTATOR = sv.LabelAnnotator(
-    color=COLOR_PALETTE,
-    color_lookup=sv.ColorLookup.INDEX,
-    text_position=sv.Position.CENTER_OF_MASS,
-    text_color=sv.Color.from_hex("#000000"),
-    border_radius=5
-)
-MASK_ANNOTATOR = sv.MaskAnnotator(
-    color=COLOR_PALETTE,
-    color_lookup=sv.ColorLookup.INDEX
-)
-
-
-def annotate_image(image, detections):
-    output_image = image.copy()
-    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
-    output_image = BOX_ANNOTATOR.annotate(output_image, detections)
-    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
-    return output_image
-
-
-def on_mode_dropdown_change(text):
-    return [
-        gr.Textbox(visible=text == IMAGE_OPEN_VOCABULARY_DETECTION_MODE),
-        gr.Textbox(visible=text == IMAGE_CAPTION_GROUNDING_MASKS_MODE),
-    ]
 
 
 @spaces.GPU
 @torch.inference_mode()
 @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def process_image(
-    mode_dropdown, image_input, text_input
-) -> Tuple[Optional[Image.Image], Optional[str]]:
+    image_input, text_input
+) -> List[Image]:
     if not image_input:
         gr.Info("Please upload an image.")
-        return None, None
-
-    if mode_dropdown == IMAGE_OPEN_VOCABULARY_DETECTION_MODE:
-        if not text_input:
-            gr.Info("Please enter a text prompt.")
-            return None, None
-
-        texts = [prompt.strip() for prompt in text_input.split(",")]
-        detections_list = []
-        for text in texts:
-            _, result = run_florence_inference(
-                model=FLORENCE_MODEL,
-                processor=FLORENCE_PROCESSOR,
-                device=DEVICE,
-                image=image_input,
-                task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
-                text=text
-            )
-            detections = sv.Detections.from_lmm(
-                lmm=sv.LMM.FLORENCE_2,
-                result=result,
-                resolution_wh=image_input.size
-            )
-            detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
-            detections_list.append(detections)
-
-        detections = sv.Detections.merge(detections_list)
-        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
-        return annotate_image(image_input, detections), None
-
-    if mode_dropdown == IMAGE_CAPTION_GROUNDING_MASKS_MODE:
-        _, result = run_florence_inference(
-            model=FLORENCE_MODEL,
-            processor=FLORENCE_PROCESSOR,
-            device=DEVICE,
-            image=image_input,
-            task=FLORENCE_DETAILED_CAPTION_TASK
-        )
-        caption = result[FLORENCE_DETAILED_CAPTION_TASK]
-        _, result = run_florence_inference(
-            model=FLORENCE_MODEL,
-            processor=FLORENCE_PROCESSOR,
-            device=DEVICE,
-            image=image_input,
-            task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
-            text=caption
-        )
-        detections = sv.Detections.from_lmm(
-            lmm=sv.LMM.FLORENCE_2,
-            result=result,
-            resolution_wh=image_input.size
-        )
-        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
-        return annotate_image(image_input, detections), caption
-
-
-@spaces.GPU(duration=300)
-@torch.inference_mode()
-@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
-def process_video(
-    video_input, text_input, progress=gr.Progress(track_tqdm=True)
-) -> Optional[str]:
-    if not video_input:
-        gr.Info("Please upload a video.")
-        return None
+        return []
 
     if not text_input:
         gr.Info("Please enter a text prompt.")
-        return None
-
-    frame_generator = sv.get_video_frames_generator(video_input)
-    frame = next(frame_generator)
-    frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+        return []
 
     texts = [prompt.strip() for prompt in text_input.split(",")]
     detections_list = []
@@ -190,208 +44,59 @@ def process_video(
             model=FLORENCE_MODEL,
             processor=FLORENCE_PROCESSOR,
             device=DEVICE,
-            image=frame,
+            image=image_input,
             task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
             text=text
         )
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2,
             result=result,
-            resolution_wh=frame.size
+            resolution_wh=image_input.size
         )
-        detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
+        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
         detections_list.append(detections)
 
     detections = sv.Detections.merge(detections_list)
-    detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
-
-    if len(detections.mask) == 0:
-        gr.Info(
-            "No objects of class {text_input} found in the first frame of the video. "
-            "Trim the video to make the object appear in the first frame or try a "
-            "different text prompt."
-        )
-        return None
-
-    name = generate_unique_name()
-    frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
-    frames_sink = sv.ImageSink(
-        target_dir_path=frame_directory_path,
-        image_name_pattern="{:05d}.jpeg"
-    )
-
-    video_info = sv.VideoInfo.from_video_path(video_input)
-    video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
-    video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)
-
-    frames_generator = sv.get_video_frames_generator(video_input)
-    with frames_sink:
-        for frame in tqdm(
-                frames_generator,
-                total=video_info.total_frames,
-                desc="splitting video into frames"
-        ):
-            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
-            frames_sink.save_image(frame)
-
-    inference_state = SAM_VIDEO_MODEL.init_state(
-        video_path=frame_directory_path,
-        device=DEVICE
-    )
-
-    for mask_index, mask in enumerate(detections.mask):
-        _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(
-            inference_state=inference_state,
-            frame_idx=0,
-            obj_id=mask_index,
-            mask=mask
-        )
-
-    video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
-    frames_generator = sv.get_video_frames_generator(video_input)
-    masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)
-    with sv.VideoSink(video_path, video_info=video_info) as sink:
-        for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
-            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
-            masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
-            if len(masks.shape) == 4:
-                masks = np.squeeze(masks, axis=1)
-
-            detections = sv.Detections(
-                xyxy=sv.mask_to_xyxy(masks=masks),
-                mask=masks,
-                class_id=np.array(tracker_ids)
-            )
-            annotated_frame = frame.copy()
-            annotated_frame = MASK_ANNOTATOR.annotate(
-                scene=annotated_frame, detections=detections)
-            annotated_frame = BOX_ANNOTATOR.annotate(
-                scene=annotated_frame, detections=detections)
-            sink.write_frame(annotated_frame)
-
-    delete_directory(frame_directory_path)
-    return video_path
+    detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
+    return [
+        Image.fromarray(mask.astype("uint8") * 255)
+        for mask
+        in detections.mask
+    ]
 
 
 with gr.Blocks() as demo:
-    gr.Markdown(MARKDOWN)
-    with gr.Tab("Image"):
-        image_processing_mode_dropdown_component = gr.Dropdown(
-            choices=IMAGE_INFERENCE_MODES,
-            value=IMAGE_INFERENCE_MODES[0],
-            label="Mode",
-            info="Select a mode to use.",
-            interactive=True
-        )
-        with gr.Row():
-            with gr.Column():
-                image_processing_image_input_component = gr.Image(
-                    type='pil', label='Upload image')
-                image_processing_text_input_component = gr.Textbox(
-                    label='Text prompt',
-                    placeholder='Enter comma separated text prompts')
-                image_processing_submit_button_component = gr.Button(
-                    value='Submit', variant='primary')
-            with gr.Column():
-                image_processing_image_output_component = gr.Image(
-                    type='pil', label='Image output')
-                image_processing_text_output_component = gr.Textbox(
-                    label='Caption output', visible=False)
-
-        with gr.Row():
-            gr.Examples(
-                fn=process_image,
-                examples=IMAGE_PROCESSING_EXAMPLES,
-                inputs=[
-                    image_processing_mode_dropdown_component,
-                    image_processing_image_input_component,
-                    image_processing_text_input_component
-                ],
-                outputs=[
-                    image_processing_image_output_component,
-                    image_processing_text_output_component
-                ],
-                run_on_click=True
-            )
-    with gr.Tab("Video"):
-        video_processing_mode_dropdown_component = gr.Dropdown(
-            choices=VIDEO_INFERENCE_MODES,
-            value=VIDEO_INFERENCE_MODES[0],
-            label="Mode",
-            info="Select a mode to use.",
-            interactive=True
-        )
-        with gr.Row():
-            with gr.Column():
-                video_processing_video_input_component = gr.Video(
-                    label='Upload video')
-                video_processing_text_input_component = gr.Textbox(
-                    label='Text prompt',
-                    placeholder='Enter comma separated text prompts')
-                video_processing_submit_button_component = gr.Button(
-                    value='Submit', variant='primary')
-            with gr.Column():
-                video_processing_video_output_component = gr.Video(
-                    label='Video output')
-        with gr.Row():
-            gr.Examples(
-                fn=process_video,
-                examples=VIDEO_PROCESSING_EXAMPLES,
-                inputs=[
-                    video_processing_video_input_component,
-                    video_processing_text_input_component
-                ],
-                outputs=video_processing_video_output_component,
-                run_on_click=True
-            )
-
-    image_processing_submit_button_component.click(
+    with gr.Row():
+        with gr.Column():
+            image_input_component = gr.Image(
+                type='pil', label='Upload image')
+            text_input_component = gr.Textbox(
+                label='Text prompt',
+                placeholder='Enter comma separated text prompts')
+            submit_button_component = gr.Button(
+                value='Submit', variant='primary')
+        with gr.Column():
+            gallery_output_component = gr.Gallery(label='Output masks')
+
+    submit_button_component.click(
         fn=process_image,
         inputs=[
-            image_processing_mode_dropdown_component,
-            image_processing_image_input_component,
-            image_processing_text_input_component
+            image_input_component,
+            text_input_component
         ],
         outputs=[
-            image_processing_image_output_component,
-            image_processing_text_output_component
+            gallery_output_component,
         ]
     )
-    image_processing_text_input_component.submit(
+    text_input_component.submit(
         fn=process_image,
         inputs=[
-            image_processing_mode_dropdown_component,
-            image_processing_image_input_component,
-            image_processing_text_input_component
+            image_input_component,
+            text_input_component
        ],
         outputs=[
-            image_processing_image_output_component,
-            image_processing_text_output_component
+            gallery_output_component,
         ]
     )
-    image_processing_mode_dropdown_component.change(
-        on_mode_dropdown_change,
-        inputs=[image_processing_mode_dropdown_component],
-        outputs=[
-            image_processing_text_input_component,
-            image_processing_text_output_component
-        ]
-    )
-    video_processing_submit_button_component.click(
-        fn=process_video,
-        inputs=[
-            video_processing_video_input_component,
-            video_processing_text_input_component
-        ],
-        outputs=video_processing_video_output_component
-    )
-    video_processing_text_input_component.submit(
-        fn=process_video,
-        inputs=[
-            video_processing_video_input_component,
-            video_processing_text_input_component
-        ],
-        outputs=video_processing_video_output_component
-    )
 
 demo.launch(debug=False, show_error=True)
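
For context, the new app.py reduces the Space to a single image-masking flow: Florence-2 open-vocabulary detection turns each comma-separated prompt into boxes, SAM2 turns those boxes into masks, and the masks come back as black-and-white PIL images for the gallery. A minimal sketch of that flow reusing the repo's own helpers (the wrapper name masks_for_prompt is illustrative; the ZeroGPU decorators and Gradio wiring are omitted):

import supervision as sv
import torch
from PIL import Image

from utils.florence import load_florence_model, run_florence_inference, \
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda")
FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)


def masks_for_prompt(image: Image.Image, prompt: str) -> list:
    # Stage 1: Florence-2 open-vocabulary detection maps the text prompt to boxes.
    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=image,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=prompt
    )
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2,
        result=result,
        resolution_wh=image.size
    )
    # Stage 2: SAM2 refines each box into a pixel-level mask.
    detections = run_sam_inference(SAM_IMAGE_MODEL, image, detections)
    # Boolean masks -> black-and-white PIL images, as process_image now returns.
    return [Image.fromarray(mask.astype("uint8") * 255) for mask in detections.mask]

Returning plain masks instead of an annotated preview is what lets other apps consume the Space programmatically, which appears to be the point of the "masking API" commit.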
utils/florence.py CHANGED
@@ -7,7 +7,8 @@ from PIL import Image
 from transformers import AutoModelForCausalLM, AutoProcessor
 from transformers.dynamic_module_utils import get_imports
 
-FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+# FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"
 FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
 FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
 FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
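
The only functional change here is which weights load_florence_model downloads. As a rough sketch (not the repo's exact loader), the checkpoint swap amounts to something like:

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

FLORENCE_CHECKPOINT = "microsoft/Florence-2-large"  # previously "microsoft/Florence-2-base"

# Florence-2 ships custom modeling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    FLORENCE_CHECKPOINT, trust_remote_code=True
).to(torch.device("cuda")).eval()
processor = AutoProcessor.from_pretrained(
    FLORENCE_CHECKPOINT, trust_remote_code=True
)

The large variant costs more VRAM and latency than base, presumably traded for better grounding of short open-vocabulary prompts, since the masks are only as good as the boxes Florence-2 hands to SAM2.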
utils/modes.py DELETED
@@ -1,13 +0,0 @@
-IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
-IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"
-
-IMAGE_INFERENCE_MODES = [
-    IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
-    IMAGE_CAPTION_GROUNDING_MASKS_MODE
-]
-
-VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"
-
-VIDEO_INFERENCE_MODES = [
-    VIDEO_OPEN_VOCABULARY_DETECTION_MODE
-]
utils/sam.py CHANGED
@@ -7,8 +7,10 @@ from PIL import Image
 from sam2.build_sam import build_sam2, build_sam2_video_predictor
 from sam2.sam2_image_predictor import SAM2ImagePredictor
 
-SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
-SAM_CONFIG = "sam2_hiera_s.yaml"
+# SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
+# SAM_CONFIG = "sam2_hiera_s.yaml"
+SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"
+SAM_CONFIG = "sam2_hiera_l.yaml"
 
 
 def load_sam_image_model(
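
The SAM2 upgrade mirrors the Florence-2 one: the Hydra config and checkpoint must name the same variant (hiera large here). A sketch of what load_sam_image_model builds with the new constants, assuming the sam2 package layout this file already imports:

import torch
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

SAM_CHECKPOINT = "checkpoints/sam2_hiera_large.pt"  # previously sam2_hiera_small.pt
SAM_CONFIG = "sam2_hiera_l.yaml"                    # previously sam2_hiera_s.yaml

# Build the SAM2 model from config + checkpoint, then wrap it in the image
# predictor that run_sam_inference uses to turn Florence-2 boxes into masks.
sam2_model = build_sam2(SAM_CONFIG, SAM_CHECKPOINT, device=torch.device("cuda"))
predictor = SAM2ImagePredictor(sam_model=sam2_model)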
utils/video.py DELETED
@@ -1,26 +0,0 @@
-import datetime
-import os
-import shutil
-import uuid
-
-
-def create_directory(directory_path: str) -> None:
-    if not os.path.exists(directory_path):
-        os.makedirs(directory_path)
-
-
-def delete_directory(directory_path: str) -> None:
-    if not os.path.exists(directory_path):
-        raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")
-
-    try:
-        shutil.rmtree(directory_path)
-    except PermissionError:
-        raise PermissionError(
-            f"Permission denied: Unable to delete '{directory_path}'.")
-
-
-def generate_unique_name():
-    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-    unique_id = uuid.uuid4()
-    return f"{current_datetime}_{unique_id}"
videos/clip-07-camera-1.mp4 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7944c1a5e9be241ebf11eb39f6302c3ce9d8482ca9f12e4268b252aeda6baee9
-size 5500081
videos/clip-07-camera-2.mp4 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:abbfef6d422c9aa3968d14de6b78aecaf544c85423d401387e3d5e75ffee3497
-size 5467189
videos/clip-07-camera-3.mp4 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e518f2ee6761d559bc864be2fec70ddc41244fbf3fea404c3158129a434ce879
-size 5397505