bawolf committed
Commit 0cb12a1
Parent: c850c95
Files changed (4)
  1. .envrc +1 -0
  2. model-card.md +40 -27
  3. requirements.txt +1 -0
  4. scripts/upload_to_hub.py +35 -8
.envrc ADDED
@@ -0,0 +1 @@
+ HF_USERNAME=bawolf
model-card.md CHANGED
@@ -16,43 +16,56 @@ This model is a fine-tuned version of CLIP (ViT-Large/14) specialized in classif
 
  ## Model Description
 
- - **Model Type:** Fine-tuned CLIP model
- - **Base Model:** ViT-Large/14
+ - **Model Type:** Custom CLIP-based architecture (VariableLengthCLIP)
+ - **Base Model:** CLIP ViT-Large/14 (for feature extraction)
+ - **Architecture:**
+   - Uses CLIP's vision encoder for frame-level feature extraction
+   - Processes multiple frames from a video
+   - Averages frame features
+   - Projects to 3 classes via a learned linear layer
  - **Task:** Video Classification
  - **Training Data:** Custom break dance video dataset
- - **Output:** 3 classes of break dance moves
+ - **Output:** 3 classes of break dance moves (windmill, halo, swipe)
 
  ## Usage
 
  ```python
- from transformers import CLIPProcessor, CLIPModel
  import torch
- import cv2
+ from transformers import CLIPProcessor
  from PIL import Image
+ import cv2
+ import numpy as np
+ from src.models.model import create_model
 
  # Load model and processor
- processor = CLIPProcessor.from_pretrained("[your-username]/clip-breakdance-classifier")
- model = CLIPModel.from_pretrained("[your-username]/clip-breakdance-classifier")
-
- # Load video and process frames
- video = cv2.VideoCapture("breakdance_move.mp4")
- predictions = []
-
- while video.isOpened():
-     ret, frame = video.read()
-     if not ret:
-         break
-
-     # Convert BGR to RGB and to PIL Image
-     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-     frame_pil = Image.fromarray(frame_rgb)
-
-     # Process frame
-     inputs = processor(images=frame_pil, return_tensors="pt")
-     outputs = model(**inputs)
-     predictions.append(outputs)
-
- video.release()
+ model = create_model(num_classes=3, pretrained_model_name="openai/clip-vit-large-patch14")
+ state_dict = torch.load("model.pth")
+ model.load_state_dict(state_dict)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+ # Process video
+ def process_video(video_path, model, processor):
+     video = cv2.VideoCapture(video_path)
+     frames = []
+
+     while video.isOpened():
+         ret, frame = video.read()
+         if not ret:
+             break
+
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         frame_pil = Image.fromarray(frame_rgb)
+         processed = processor(images=frame_pil, return_tensors="pt")
+         frames.append(processed.pixel_values)
+
+     video.release()
+
+     # Stack frames and process
+     frames_tensor = torch.cat(frames, dim=0)
+     with torch.no_grad():
+         predictions = model(frames_tensor.unsqueeze(0))
+
+     return predictions
  ```
 
  ## Limitations
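
The updated model card imports `create_model` from `src/models/model.py`, which is not part of this commit. For orientation, here is a minimal sketch of what a `VariableLengthCLIP` built by `create_model` might look like, following the architecture bullets in the card (CLIP vision encoder per frame, mean-pooled frame features, linear head over 3 classes). The class internals and attribute names are assumptions, not the repository's actual code.

```python
# Hypothetical sketch of src/models/model.py (not included in this commit).
# Assumes the architecture described in the model card: CLIP vision encoder
# per frame, mean-pooled frame features, and a learned linear classifier head.
import torch
import torch.nn as nn
from transformers import CLIPModel


class VariableLengthCLIP(nn.Module):
    def __init__(self, clip_model: CLIPModel, num_classes: int):
        super().__init__()
        self.vision_model = clip_model.vision_model            # frame-level encoder
        self.visual_projection = clip_model.visual_projection  # to CLIP embedding space
        self.classifier = nn.Linear(self.visual_projection.out_features, num_classes)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # pixel_values: (batch, num_frames, 3, H, W)
        b, f = pixel_values.shape[:2]
        frames = pixel_values.flatten(0, 1)                    # (batch * num_frames, 3, H, W)
        feats = self.vision_model(pixel_values=frames).pooler_output
        feats = self.visual_projection(feats)
        feats = feats.view(b, f, -1).mean(dim=1)               # average over frames
        return self.classifier(feats)                          # (batch, num_classes) logits


def create_model(num_classes: int, pretrained_model_name: str) -> VariableLengthCLIP:
    clip = CLIPModel.from_pretrained(pretrained_model_name)
    return VariableLengthCLIP(clip, num_classes)
```

Under this sketch, the card's `process_video` helper, which passes `frames_tensor.unsqueeze(0)` of shape `(1, num_frames, 3, H, W)`, lines up with the `forward` signature above, and the returned logits can be turned into class probabilities with a softmax over the 3 labels.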
requirements.txt CHANGED
@@ -9,6 +9,7 @@ cog==0.12.0
  colorlog==6.9.0
  contourpy==1.3.0
  cycler==0.12.1
+ dotenv==1.0.1
  fastapi==0.110.3
  filelock==3.16.1
  fonttools==4.54.1
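
The new `dotenv` requirement backs the `from dotenv import load_dotenv` import in `scripts/upload_to_hub.py` below; that import is typically provided by the python-dotenv distribution. Note that a bare `load_dotenv()` looks for a `.env` file, while this commit stores `HF_USERNAME` in `.envrc`, so reading that file from Python needs an explicit path. A minimal sketch, assuming the python-dotenv API:

```python
# Minimal sketch: read HF_USERNAME from the .envrc added in this commit.
# Assumes python-dotenv, whose load_dotenv() accepts an explicit file path.
import os

from dotenv import load_dotenv

load_dotenv(".envrc")  # a bare load_dotenv() would look for .env instead
print(os.getenv("HF_USERNAME"))  # -> "bawolf" when .envrc is present
```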
scripts/upload_to_hub.py CHANGED
@@ -1,17 +1,44 @@
- from transformers import CLIPProcessor, CLIPModel
+ from transformers import CLIPProcessor
  from huggingface_hub import HfApi
+ import os
+ from dotenv import load_dotenv
+ import torch
+ from src.models.model import create_model
 
- def upload_model_to_hub():
+ def upload_model_to_hub(hf_username):
      # Initialize huggingface api
      api = HfApi()
 
-     # Load your fine-tuned model
-     model = CLIPModel.from_pretrained("./checkpoints/")
+     # Load your custom model
+     num_classes = 3  # windmills, halos, and swipes
+     model = create_model(num_classes, "openai/clip-vit-large-patch14")
+
+     # Load your trained weights
+     state_dict = torch.load("./checkpoints/model.pth", map_location="cpu")
+     model.load_state_dict(state_dict, strict=False)
+
+     # Get the processor from the base CLIP model
      processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 
-     # Push to hub
-     model.push_to_hub("[your-username]/clip-breakdance-classifier")
-     processor.push_to_hub("[your-username]/clip-breakdance-classifier")
+     repo_id = f"{hf_username}/breaking-vision-clip-classifier"
+
+     # Save model configuration and architecture
+     config = {
+         "num_classes": num_classes,
+         "base_model": "openai/clip-vit-large-patch14",
+         "class_labels": ["windmill", "halo", "swipe"],
+         "model_type": "VariableLengthCLIP"
+     }
+
+     # Push to hub with config
+     model.push_to_hub(
+         repo_id,
+         config_dict=config,
+         commit_message="Upload custom CLIP-based dance classifier"
+     )
+     processor.push_to_hub(repo_id)
 
  if __name__ == "__main__":
-     upload_model_to_hub()
+     load_dotenv()
+     hf_username = os.getenv("HF_USERNAME")
+     upload_model_to_hub(hf_username)
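
For completeness, this is roughly how the uploaded repository could be consumed afterwards. It is a sketch under assumptions: the weights filename (`model.pth`) depends on how the custom model's `push_to_hub` serializes the checkpoint, and `create_model` comes from the same `src/models/model.py` module referenced above.

```python
# Hypothetical consumer-side sketch: download the uploaded checkpoint and
# rebuild the classifier locally. The filename "model.pth" is an assumption;
# check the repo's file list for the actual name produced by push_to_hub.
import torch
from huggingface_hub import hf_hub_download
from transformers import CLIPProcessor

from src.models.model import create_model

repo_id = "bawolf/breaking-vision-clip-classifier"
weights_path = hf_hub_download(repo_id=repo_id, filename="model.pth")  # assumed filename

model = create_model(num_classes=3, pretrained_model_name="openai/clip-vit-large-patch14")
model.load_state_dict(torch.load(weights_path, map_location="cpu"))
model.eval()

processor = CLIPProcessor.from_pretrained(repo_id)  # processor was pushed to the same repo
```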