bawolf committed
Commit 0cb12a1
Parent: c850c95
Files changed (4)
  1. .envrc +1 -0
  2. model-card.md +40 -27
  3. requirements.txt +1 -0
  4. scripts/upload_to_hub.py +35 -8
.envrc ADDED
@@ -0,0 +1 @@
+ HF_USERNAME=bawolf
model-card.md CHANGED
@@ -16,43 +16,56 @@ This model is a fine-tuned version of CLIP (ViT-Large/14) specialized in classif
 
  ## Model Description
 
- - **Model Type:** Fine-tuned CLIP model
- - **Base Model:** ViT-Large/14
+ - **Model Type:** Custom CLIP-based architecture (VariableLengthCLIP)
+ - **Base Model:** CLIP ViT-Large/14 (for feature extraction)
+ - **Architecture:**
+   - Uses CLIP's vision encoder for frame-level feature extraction
+   - Processes multiple frames from a video
+   - Averages frame features
+   - Projects to 3 classes via a learned linear layer
  - **Task:** Video Classification
  - **Training Data:** Custom break dance video dataset
- - **Output:** 3 classes of break dance moves
+ - **Output:** 3 classes of break dance moves (windmill, halo, swipe)
 
  ## Usage
 
  ```python
- from transformers import CLIPProcessor, CLIPModel
  import torch
- import cv2
+ from transformers import CLIPProcessor
  from PIL import Image
+ import cv2
+ import numpy as np
+ from src.models.model import create_model
 
  # Load model and processor
- processor = CLIPProcessor.from_pretrained("[your-username]/clip-breakdance-classifier")
- model = CLIPModel.from_pretrained("[your-username]/clip-breakdance-classifier")
-
- # Load video and process frames
- video = cv2.VideoCapture("breakdance_move.mp4")
- predictions = []
-
- while video.isOpened():
-     ret, frame = video.read()
-     if not ret:
-         break
-
-     # Convert BGR to RGB and to PIL Image
-     frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-     frame_pil = Image.fromarray(frame_rgb)
-
-     # Process frame
-     inputs = processor(images=frame_pil, return_tensors="pt")
-     outputs = model(**inputs)
-     predictions.append(outputs)
-
- video.release()
+ model = create_model(num_classes=3, pretrained_model_name="openai/clip-vit-large-patch14")
+ state_dict = torch.load("model.pth")
+ model.load_state_dict(state_dict)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+ # Process video
+ def process_video(video_path, model, processor):
+     video = cv2.VideoCapture(video_path)
+     frames = []
+
+     while video.isOpened():
+         ret, frame = video.read()
+         if not ret:
+             break
+
+         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         frame_pil = Image.fromarray(frame_rgb)
+         processed = processor(images=frame_pil, return_tensors="pt")
+         frames.append(processed.pixel_values)
+
+     video.release()
+
+     # Stack frames and process
+     frames_tensor = torch.cat(frames, dim=0)
+     with torch.no_grad():
+         predictions = model(frames_tensor.unsqueeze(0))
+
+     return predictions
  ```
 
  ## Limitations
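
The updated model card imports `create_model` from `src/models/model.py`, which is not part of this commit. For orientation, here is a minimal sketch of what a `VariableLengthCLIP` built by `create_model` might look like, following the architecture bullets in the card (CLIP vision encoder per frame, mean-pooled frame features, linear head over 3 classes). The class internals and attribute names are assumptions, not the repository's actual code.

```python
# Hypothetical sketch of src/models/model.py (not included in this commit).
# Assumes the architecture described in the model card: CLIP vision encoder
# per frame, mean-pooled frame features, and a learned linear classifier head.
import torch
import torch.nn as nn
from transformers import CLIPModel


class VariableLengthCLIP(nn.Module):
    def __init__(self, clip_model: CLIPModel, num_classes: int):
        super().__init__()
        self.vision_model = clip_model.vision_model            # frame-level encoder
        self.visual_projection = clip_model.visual_projection  # to CLIP embedding space
        self.classifier = nn.Linear(self.visual_projection.out_features, num_classes)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # pixel_values: (batch, num_frames, 3, H, W)
        b, f = pixel_values.shape[:2]
        frames = pixel_values.flatten(0, 1)                    # (batch * num_frames, 3, H, W)
        feats = self.vision_model(pixel_values=frames).pooler_output
        feats = self.visual_projection(feats)
        feats = feats.view(b, f, -1).mean(dim=1)               # average over frames
        return self.classifier(feats)                          # (batch, num_classes) logits


def create_model(num_classes: int, pretrained_model_name: str) -> VariableLengthCLIP:
    clip = CLIPModel.from_pretrained(pretrained_model_name)
    return VariableLengthCLIP(clip, num_classes)
```

Under this sketch, the card's `process_video` helper, which passes `frames_tensor.unsqueeze(0)` of shape `(1, num_frames, 3, H, W)`, lines up with the `forward` signature above, and the returned logits can be turned into class probabilities with a softmax over the 3 labels.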
requirements.txt CHANGED
@@ -9,6 +9,7 @@ cog==0.12.0
  colorlog==6.9.0
  contourpy==1.3.0
  cycler==0.12.1
+ dotenv==1.0.1
  fastapi==0.110.3
  filelock==3.16.1
  fonttools==4.54.1
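
The new `dotenv` requirement backs the `from dotenv import load_dotenv` import in `scripts/upload_to_hub.py` below; that import is typically provided by the python-dotenv distribution. Note that a bare `load_dotenv()` looks for a `.env` file, while this commit stores `HF_USERNAME` in `.envrc`, so reading that file from Python needs an explicit path. A minimal sketch, assuming the python-dotenv API:

```python
# Minimal sketch: read HF_USERNAME from the .envrc added in this commit.
# Assumes python-dotenv, whose load_dotenv() accepts an explicit file path.
import os

from dotenv import load_dotenv

load_dotenv(".envrc")  # a bare load_dotenv() would look for .env instead
print(os.getenv("HF_USERNAME"))  # -> "bawolf" when .envrc is present
```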
scripts/upload_to_hub.py CHANGED
@@ -1,17 +1,44 @@
- from transformers import CLIPProcessor, CLIPModel
+ from transformers import CLIPProcessor
  from huggingface_hub import HfApi
+ import os
+ from dotenv import load_dotenv
+ import torch
+ from src.models.model import create_model
 
- def upload_model_to_hub():
+ def upload_model_to_hub(hf_username):
      # Initialize huggingface api
      api = HfApi()
 
-     # Load your fine-tuned model
-     model = CLIPModel.from_pretrained("./checkpoints/")
+     # Load your custom model
+     num_classes = 3  # windmills, halos, and swipes
+     model = create_model(num_classes, "openai/clip-vit-large-patch14")
+
+     # Load your trained weights
+     state_dict = torch.load("./checkpoints/model.pth", map_location="cpu")
+     model.load_state_dict(state_dict, strict=False)
+
+     # Get the processor from the base CLIP model
      processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 
-     # Push to hub
-     model.push_to_hub("[your-username]/clip-breakdance-classifier")
-     processor.push_to_hub("[your-username]/clip-breakdance-classifier")
+     repo_id = f"{hf_username}/breaking-vision-clip-classifier"
+
+     # Save model configuration and architecture
+     config = {
+         "num_classes": num_classes,
+         "base_model": "openai/clip-vit-large-patch14",
+         "class_labels": ["windmill", "halo", "swipe"],
+         "model_type": "VariableLengthCLIP"
+     }
+
+     # Push to hub with config
+     model.push_to_hub(
+         repo_id,
+         config_dict=config,
+         commit_message="Upload custom CLIP-based dance classifier"
+     )
+     processor.push_to_hub(repo_id)
 
  if __name__ == "__main__":
-     upload_model_to_hub()
+     load_dotenv()
+     hf_username = os.getenv("HF_USERNAME")
+     upload_model_to_hub(hf_username)
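
For completeness, this is roughly how the uploaded repository could be consumed afterwards. It is a sketch under assumptions: the weights filename (`model.pth`) depends on how the custom model's `push_to_hub` serializes the checkpoint, and `create_model` comes from the same `src/models/model.py` module referenced above.

```python
# Hypothetical consumer-side sketch: download the uploaded checkpoint and
# rebuild the classifier locally. The filename "model.pth" is an assumption;
# check the repo's file list for the actual name produced by push_to_hub.
import torch
from huggingface_hub import hf_hub_download
from transformers import CLIPProcessor

from src.models.model import create_model

repo_id = "bawolf/breaking-vision-clip-classifier"
weights_path = hf_hub_download(repo_id=repo_id, filename="model.pth")  # assumed filename

model = create_model(num_classes=3, pretrained_model_name="openai/clip-vit-large-patch14")
model.load_state_dict(torch.load(weights_path, map_location="cpu"))
model.eval()

processor = CLIPProcessor.from_pretrained(repo_id)  # processor was pushed to the same repo
```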