# %cd /content/florence-sam
# Setup: imports, CUDA device selection, Florence-2 + SAM model loading, and
# extraction of the first video frame for open-vocabulary detection.
import os
from typing import Tuple, Optional
import shutil
import sys
import json
import pickle

import cv2
import numpy as np
import spaces
import supervision as sv
import torch
from PIL import Image
from tqdm import tqdm

os.chdir("/content/florence-sam")
sys.path.append('/content/florence-sam')

from utils.video import generate_unique_name, create_directory, delete_directory
from utils.florence import load_florence_model, run_florence_inference, \
    FLORENCE_DETAILED_CAPTION_TASK, \
    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
    IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model

# NOTE(review): the original assigned DEVICE three times (bare "cuda", then the
# last visible GPU, then the first); only the final assignment took effect, so
# this is equivalent to always using cuda:0.
DEVICE = torch.device('cuda:0')

# Enter a global bfloat16 autocast context for the remainder of the script.
torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
if torch.cuda.get_device_properties(0).major >= 8:
    # Ampere (SM 8.x) or newer: enable TF32 matmuls for speed.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)

# Inputs handed over from the previous pipeline stage via pickle files.
with open('/content/texts.pkl', 'rb') as file:
    texts = pickle.load(file)
print(texts)
with open('/content/output_video.pkl', 'rb') as file:
    output_video = pickle.load(file)
print(output_video)

VIDEO_SCALE_FACTOR = 1
VIDEO_TARGET_DIRECTORY = "/content/"
create_directory(directory_path=VIDEO_TARGET_DIRECTORY)

video_input = output_video
# NOTE(review): this hard-coded list overrides the prompts just loaded from
# texts.pkl — confirm the override is intentional before shipping.
texts = ['the table', 'men', 'ball']
# VIDEO_TARGET_DIRECTORY = "/content/"

if not video_input:
    print("Please upload a video.")

# Grab the first frame and convert BGR (OpenCV) -> RGB (PIL) for Florence.
frame_generator = sv.get_video_frames_generator(video_input)
frame = next(frame_generator)
frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
detections_list = []
# Detection pipeline: run Florence-2 open-vocabulary detection for each text
# prompt on the first frame, filter the boxes by label-specific rules, keep at
# most one table (the largest, if it covers < 50% of the frame), then run SAM
# on each accepted detection and persist the results.
width, height = frame.size
all_ok_bboxes = []
half_area = width * height * 0.5  # a table covering >= 50% of the frame is rejected

# Bounding boxes, areas, and Florence results for every 'the table' detection.
table_bboxes = []
table_areas = []
table_results = []  # parallel to table_bboxes: the Florence result per table
given_area = 1000   # maximum pixel area accepted for a ping-pong-ball box
ok_result = []

for text in texts:
    _, result = run_florence_inference(
        model=FLORENCE_MODEL,
        processor=FLORENCE_PROCESSOR,
        device=DEVICE,
        image=frame,
        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
        text=text
    )
    # print(result)
    for bbox, label in zip(result['']['bboxes'], result['']['bboxes_labels']):
        print(bbox, label)
        # Wrap the single detection back into Florence's result format so it
        # can be fed to sv.Detections.from_lmm later.
        new_result = {'': {'bboxes': [bbox], 'bboxes_labels': [label], 'polygons': [], 'polygons_labels': []}}
        print(new_result)
        if label == 'ping pong ball':
            # Accept the ball only if its area does not exceed the threshold.
            area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            if area <= given_area:
                all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
                ok_result.append(new_result)
        elif label == 'the table':
            print('the tablethe table!!!!')
            area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            # Widen the table box by 100 px on each horizontal side.
            table_bboxes.append([[bbox[0] - 100, bbox[1]], [bbox[2] + 100, bbox[3]]])
            table_areas.append(area)
            # BUG FIX: remember this detection's Florence result so the
            # max-area selection below appends the result matching the chosen
            # table, instead of whatever `new_result` happened to be last
            # after the loops finished (stale-variable bug in the original).
            table_results.append(new_result)
        elif label == 'table tennis bat':
            all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
            ok_result.append(new_result)
        elif label == 'men':
            print('menmne!!!!')
            all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
            ok_result.append(new_result)

# Keep only the largest detected table, and only when it covers less than
# half of the frame.
if table_areas:
    max_area_index = table_areas.index(max(table_areas))
    max_area_bbox = table_bboxes[max_area_index]
    if max(table_areas) < half_area:
        all_ok_bboxes.append(max_area_bbox)
        ok_result.append(table_results[max_area_index])

print(ok_result)
with open('/content/all_ok_bboxes.pkl', 'wb') as file:
    pickle.dump(all_ok_bboxes, file)

# Run SAM on each accepted Florence detection and collect the mask detections.
for xyxy in ok_result:
    print(frame.size, xyxy)
    detections = sv.Detections.from_lmm(
        lmm=sv.LMM.FLORENCE_2,
        result=xyxy,
        resolution_wh=frame.size
    )
    detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
    print(detections)
    detections_list.append(detections)

with open('/content/detections_list.pkl', 'wb') as file:
    pickle.dump(detections_list, file)
print(detections_list)