Spaces:

sandeshb
/

VidTune-Gradio

Running on Zero

@@ -1,65 +0,0 @@
-import requests
-import argparse
-# Parse command line arguments
-parser = argparse.ArgumentParser(description="Music Generation Client")
-parser.add_argument(
-    "--server_url", type=str, default="http://localhost:8000", help="URL of the server"
-)
-parser.add_argument(
-    "--prompts",
-    nargs="+",
-    type=str,
-    default=["Lofi Music for Coding"],
-    help="Prompts for music generation",
-)
-parser.add_argument(
-    "--output_file", type=str, default="output.wav", help="Output file name"
-)
-parser.add_argument(
-    "--duration", type=int, default=10, help="Duration of generated music in seconds"
-)
-parser.add_argument(
-    "--check_health", action='store_true', help="Check server health"
-)
-args = parser.parse_args()
-def generate_music(server_url, prompts, duration, output_file):
-    url = f"{server_url}/generate_music"
-    headers = {"Content-Type": "application/json"}
-    data = {"prompts": prompts, "duration": duration}
-    response = requests.post(url, json=data, headers=headers)
-    if response.status_code == 200:
-        with open(output_file, "wb") as f:
-            f.write(response.content)
-        print(f"Music saved to {output_file}")
-    else:
-        print(f"Failed to generate music: {response.status_code}, {response.text}")
-def check_server_health(server_url):
-    url = f"{server_url}/health"
-    response = requests.get(url)
-    if response.status_code == 200:
-        health_status = response.json()
-        print("Server Health Check:")
-        print(f"Server Running: {health_status['server_running']}")
-        print(f"Model Loaded: {health_status['model_loaded']}")
-        print(f"CPU Usage: {health_status['cpu_usage_percent']}%")
-        print(f"RAM Usage: {health_status['ram_usage_percent']}%")
-        if 'gpu_memory_allocated' in health_status:
-            gpu_memory_allocated_gb = health_status['gpu_memory_allocated'] / (1024 ** 3)
-            gpu_memory_reserved_gb = health_status['gpu_memory_reserved'] / (1024 ** 3)
-            print(f"GPU Memory Allocated: {gpu_memory_allocated_gb:.2f} GB")
-            print(f"GPU Memory Reserved: {gpu_memory_reserved_gb:.2f} GB")
-    else:
-        print(f"Failed to check server health: {response.status_code}, {response.text}")
-if __name__ == "__main__":
-    if args.check_health:
-        check_server_health(args.server_url)
-    else:
-        generate_music(args.server_url, args.prompts, args.duration, args.output_file)

__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .logger import logging

engine/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .video_descriptor import DescribeVideo

engine/audio_generator.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # TODO: Add from model server

engine/video_descriptor.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from warnings import simplefilter
+simplefilter("ignore")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+import json
+import time
+import google.generativeai as genai
+try:
+    from logger import logging
+except:
+    import logging
+# Sample music prompts that are interpolated into the instruction text below
+# to show the model the style of prompt it should produce.
+music_prompt_examples = """
+'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
+'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
+'90s rock song with electric guitar and heavy drums'.
+"""
+# Shape of the JSON object the model is asked to return; it is only described
+# to the model in the instruction text, not enforced by this module.
+json_schema = """
+{"Content Description": "string", "Music Prompt": "string"}
+"""
+# Instruction text sent alongside the video explanation in the second
+# generate_content call of DescribeVideo.describe_video.
+# NOTE: the name is spelled `gemni_instructions` (sic) and is referenced
+# under that spelling, so it must not be renamed in isolation.
+gemni_instructions = f"""
+You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Use keywords related to genre, instruments, mood, context, and setting to craft a concise single-sentence prompt, like:
+{music_prompt_examples}
+You must return your response using this JSON schema: {json_schema}
+"""
+class DescribeVideo:
+    def __init__(self, model="flash"):
+        self.model = self.get_model_name(model)
+        __api_key = self.load_api_key()
+        self.is_safety_set = False
+        self.safety_settings = self.get_safety_settings()
+        genai.configure(api_key=__api_key)
+        self.mllm_model = genai.GenerativeModel(self.model)
+        logging.info(f"Initialized DescribeVideo with model: {self.model}")
+    def describe_video(self, video_path):
+        video_file = genai.upload_file(video_path)
+        logging.info(f"Uploaded video: {video_path}")
+        while video_file.state.name == "PROCESSING":
+            time.sleep(0.25)
+            video_file = genai.get_file(video_file.name)
+        if video_file.state.name == "FAILED":
+            logging.error(f"Failed to upload video: {video_file.state.name}")
+            raise ValueError(f"Failed to upload video: {video_file.state.name}")
+        response = self.mllm_model.generate_content(
+            [video_file, "Explain what is happening in this video"],
+            request_options={"timeout": 600},
+            safety_settings=self.safety_settings,
+        )
+        logging.info(
+            f"Generated content for video: {video_path} with response: {response.text}"
+        )
+        cleaned_response = self.mllm_model.generate_content(
+            [
+                response.text,
+                gemni_instructions,
+            ],
+            safety_settings=self.safety_settings,
+        )
+        logging.info(f"Generated : {video_path} with response: {cleaned_response.text}")
+        return json.loads(cleaned_response.text.strip("```json\n"))
+    def reset_safety_settings(self):
+        logging.info("Resetting safety settings")
+        self.is_safety_set = False
+        self.safety_settings = self.get_safety_settings()
+    def set_safety_settings(self, safety_settings):
+        self.safety_settings = safety_settings
+        # Sanity Checks
+        if not isinstance(safety_settings, dict):
+            raise ValueError("Safety settings must be a dictionary")
+        for harm_category, harm_block_threshold in safety_settings.items():
+            if harm_category not in genai.types.HarmCategory.__members__:
+                raise ValueError(f"Invalid harm category: {harm_category}")
+            if harm_block_threshold not in genai.types.HarmBlockThreshold.__members__:
+                raise ValueError(
+                    f"Invalid harm block threshold: {harm_block_threshold}"
+                )
+        logging.info(f"Set safety settings: {safety_settings}")
+        self.safety_settings = safety_settings
+        self.is_safety_set = True
+    def get_safety_settings(self):
+        default_safety_settings = {
+            genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
+            genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+            genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+            genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
+        }
+        if self.is_safety_set:
+            return self.safety_settings
+        return default_safety_settings
+    @staticmethod
+    def load_api_key(path="./creds.json"):
+        with open(path) as f:
+            creds = json.load(f)
+        api_key = creds.get("google_api_key", None)
+        if api_key is None or not isinstance(api_key, str):
+            logging.error(f"Google API key not found in {path}")
+            raise ValueError(f"Gemini API key not found in {path}")
+        return api_key
+    @staticmethod
+    def get_model_name(model):
+        models = {
+            "flash": "models/gemini-1.5-flash-latest",
+            "pro": "models/gemini-1.5-pro-latest",
+        }
+        if model not in models:
+            logging.error(
+                f"Invalid model name '{model}'. Valid options are: {', '.join(models.keys())}"
+            )
+            raise ValueError(
+                f"Invalid model name '{model}'. Valid options are: {', '.join(models.keys())}"
+            )
+        logging.info(f"Selected model: {models[model]}")
+        return models[model]
+if __name__ == "__main__":
+    video_path = "videos/3A49B385FD4A8FE2E3AEEF43C140D9AF_video_dashinit.mp4"
+    dv = DescribeVideo(model="flash")
+    video_description = dv.describe_video(video_path)
+    print(video_description)

logger.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import logging
+FORMAT = "%(asctime)s: %(levelname)s: %(message)s"
+logging.basicConfig(filename='logs.log', level=logging.INFO, format=FORMAT)
+STDERRLOGGER = logging.StreamHandler()
+STDERRLOGGER.setFormatter(logging.Formatter(FORMAT))
+logging.getLogger().addHandler(STDERRLOGGER)

main.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import streamlit as st
+def main():
+    st.set_page_config(page_title="VidTune: Where Videos Find Their Melody", layout="centered")
+    # Title and Description
+    st.title("VidTune: Where Videos Find Their Melody")
+    st.write("VidTune is a web application that allows users to upload videos and generate melodies matching the mood of the video.")
+    # Main Page (Page 1)
+    if 'page' not in st.session_state:
+        st.session_state.page = 'main'
+    if st.session_state.page == 'main':
+        st.header("Video to Music")
+        uploaded_video = st.file_uploader("Upload Video", type=["mp4"])
+        if uploaded_video is not None:
+            st.session_state.uploaded_video = uploaded_video
+            st.session_state.page = 'video_to_music'
+        if st.session_state.page == 'main':
+            st.header("Prompt to Music")
+            prompt = st.text_area("Prompt")
+            if st.button("Generate"):
+                st.session_state.prompt = prompt
+                st.session_state.page = 'prompt_to_music'
+    # Page 2a (If the user uploads a video)
+    if st.session_state.page == 'video_to_music':
+        st.sidebar.title("Settings")
+        device = st.sidebar.selectbox("Select Device", ["GPU", "CPU"], index=0)
+        num_samples = st.sidebar.slider("Number of samples", 1, 10, 3)
+        st.video(st.session_state.uploaded_video)
+        st.text_area("Video Description", "This is a fixed video description", disabled=True)
+        st.text_area("Music Description")
+        if st.button("Generate Music"):
+            st.session_state.page = 'result'
+            st.session_state.device = device
+            st.session_state.num_samples = num_samples
+    # Page 2b (If user selects "Prompt to Music" in Page 1)
+    if st.session_state.page == 'prompt_to_music':
+        st.sidebar.title("Settings")
+        device = st.sidebar.selectbox("Select Device", ["GPU", "CPU"], index=0)
+        num_samples = st.sidebar.slider("Number of samples", 1, 10, 3)
+        if st.button("Generate Music"):
+            st.session_state.page = 'result'
+            st.session_state.device = device
+            st.session_state.num_samples = num_samples
+    # Page 3 (Results Page)
+    if st.session_state.page == 'result':
+        st.header("Generated Music")
+        for i in range(st.session_state.num_samples):
+            st.write(f"Music Sample {i+1}")
+            st.audio(f"Generated Music {i+1}.mp3", format='audio/mp3')
+            st.download_button(f"Download Music {i+1}", f"Generated Music {i+1}.mp3")
+        if st.button("Start Over"):
+            st.session_state.page = 'main'
+if __name__ == "__main__":
+    main()

run_test.sh DELETED Viewed

@@ -1,28 +0,0 @@
-#!/bin/bash
-# Smoke test: start the server, run the client against it once, then stop
-# the server again.
-echo "Script started."
-# Run server in the background and remember its PID
-echo "Starting server..."
-python server.py --duration 10 &
-SERVER_PID=$!
-# Stop the server on any exit path, including a failing client
-trap 'kill "$SERVER_PID" 2>/dev/null' EXIT
-echo "Server started."
-# Sleep
-echo "Waiting for the server to startup..."
-sleep 10
-# Run client
-echo "Starting client..."
-python client.py --server_url http://localhost:8000 --prompts "Lofi Music for Coding" --output_file output.wav
-echo "Client finished."
-# Kill server
-# Kill by PID: matching "server.py" in the ps output would also hit the
-# grep process itself and any unrelated process with that name.
-echo "Killing server..."
-kill "$SERVER_PID" 2>/dev/null
-# Wait for the server process to exit instead of sleeping a fixed time
-wait "$SERVER_PID" 2>/dev/null
-trap - EXIT
-# Done
-echo "Script finished."

server.py DELETED Viewed

@@ -1,90 +0,0 @@
-import warnings
-import argparse
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from typing import List, Optional
-import torch
-from torch.cuda import memory_allocated, memory_reserved
-from audiocraft.models import musicgen
-import numpy as np
-import io
-from fastapi.responses import StreamingResponse, JSONResponse
-from scipy.io.wavfile import write as wav_write
-import uvicorn
-import psutil
-warnings.simplefilter('ignore')
-# Parse command line arguments
-parser = argparse.ArgumentParser(description="Music Generation Server")
-parser.add_argument("--model", type=str, default="musicgen-stereo-small", help="Pretrained model name")
-parser.add_argument("--device", type=str, default="cuda", help="Device to load the model on")
-parser.add_argument("--duration", type=int, default=10, help="Duration of generated music in seconds")
-parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the server on")
-parser.add_argument("--port", type=int, default=8000, help="Port to run the server on")
-args = parser.parse_args()
-# Initialize the FastAPI app
-app = FastAPI()
-# Build the model name based on the provided arguments
-if args.model.startswith('facebook/'):
-    args.model_name = args.model
-else:
-    args.model_name = f"facebook/{args.model}"
-# Load the model with the provided arguments
-try:
-    musicgen_model = musicgen.MusicGen.get_pretrained(args.model_name, device=args.device)
-    model_loaded = True
-except Exception as e:
-    musicgen_model = None
-    model_loaded = False
-# Request body accepted by the POST /generate_music endpoint.
-class MusicRequest(BaseModel):
-    # Text prompts handed to the model's generate() call.
-    prompts: List[str]
-    # Clip length in seconds, forwarded to set_generation_params().
-    duration: Optional[int] = 10  # Default duration is 10 seconds if not provided
-@app.post("/generate_music")
-def generate_music(request: MusicRequest):
-    if not model_loaded:
-        raise HTTPException(status_code=500, detail="Model is not loaded.")
-    try:
-        musicgen_model.set_generation_params(duration=request.duration)
-        result = musicgen_model.generate(request.prompts, progress=False)
-        result = result.squeeze().cpu().numpy().T
-        sample_rate = musicgen_model.sample_rate
-        buffer = io.BytesIO()
-        wav_write(buffer, sample_rate, result)
-        buffer.seek(0)
-        return StreamingResponse(buffer, media_type="audio/wav")
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.get("/health")
-def health_check():
-    cpu_usage = psutil.cpu_percent(interval=1)
-    ram_usage = psutil.virtual_memory().percent
-    stats = {
-        "server_running": True,
-        "model_loaded": model_loaded,
-        "cpu_usage_percent": cpu_usage,
-        "ram_usage_percent": ram_usage
-    }
-    if args.device == "cuda" and torch.cuda.is_available():
-        gpu_memory_allocated = memory_allocated()
-        gpu_memory_reserved = memory_reserved()
-        stats.update({
-            "gpu_memory_allocated": gpu_memory_allocated,
-            "gpu_memory_reserved": gpu_memory_reserved
-        })
-    return JSONResponse(content=stats)
-if __name__ == "__main__":
-    uvicorn.run(app, host=args.host, port=args.port)