Case-Study-1

Sleeping

App Files Files Community

Julian-Hans committed on Sep 14

Commit

ddf2ccc

•

1 Parent(s): 6b9fb43

renamed poc_app.py to app.py, clean up for app logic, implemented function to yield intermediate results, added generation length parameter to config, changed path handling in app.py

Browse files

Files changed (11) hide show

README.md +18 -0
__pycache__/blip_image_caption_large.cpython-311.pyc +0 -0
__pycache__/config.cpython-311.pyc +0 -0
__pycache__/musicgen_small.cpython-311.pyc +0 -0
__pycache__/phi3_mini_4k_instruct.cpython-311.pyc +0 -0
app.py +89 -0
config.py +8 -1
musicgen_small.py +3 -3
phi3_mini_4k_instruct.py +2 -0
poc_app.py +0 -80
requirements.txt +43 -0

README.md ADDED Viewed

	@@ -0,0 +1,18 @@

+---
+title: "Case-Study-1: Image-To-Music"
+emoji: 🎼
+colorFrom: gray
+colorTo: blue
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+---
+## Case-Study-1: Image-To-Music 🎼
+An image to music converter, built with the following models:
+- https://huggingface.co/Salesforce/blip-image-captioning-large for Image Captioning
+- https://huggingface.co/microsoft/Phi-3-mini-4k-instruct       for Audio Prompt generation with Caption
+- https://huggingface.co/facebook/musicgen-small                for Music Generation
+Currently supports .jpg, .jpeg, and .png!

__pycache__/blip_image_caption_large.cpython-311.pyc ADDED Viewed

Binary file (1.11 kB). View file

__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (484 Bytes). View file

__pycache__/musicgen_small.cpython-311.pyc ADDED Viewed

Binary file (1.46 kB). View file

__pycache__/phi3_mini_4k_instruct.cpython-311.pyc ADDED Viewed

Binary file (1.36 kB). View file

app.py CHANGED Viewed

	@@ -0,0 +1,89 @@

+# external imports
+import time
+import uuid
+import gradio as gr
+# local imports
+from blip_image_caption_large import Blip_Image_Caption_Large
+from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct
+from musicgen_small import Musicgen_Small
+import config
+class Image_To_Music:
+    """Chain three models to turn an image into a piece of music.
+
+    The stages are: image -> caption, caption -> music description,
+    music description -> .wav file. Intermediate results and the wall-clock
+    duration of each stage are stored on the instance.
+    """
+    def __init__(self):
+        """Load the three models and reset all per-run state."""
+        self.image_caption_model = Blip_Image_Caption_Large()
+        self.text_generation_model = Phi3_Mini_4k_Instruct()
+        self.music_generation_model = Musicgen_Small()
+        self.image_path = None
+        self.generated_caption = None
+        self.generated_description = None
+        # NOTE(review): the output path is chosen once per instance, so every
+        # call to generate_music() on this instance writes to the same file.
+        self.audio_path = config.AUDIO_DIR + str(uuid.uuid4()) + ".wav"
+        # -1 marks a stage that has not been run yet.
+        self.caption_generation_duration = -1
+        self.description_generation_duration = -1
+        self.music_generation_duration = -1
+    def caption_image(self, image_path):
+        """Caption the image at `image_path`, store and return the caption."""
+        caption_start_time = time.time()
+        self.image_path = image_path
+        # The captioning call returns a list of dicts; the text of the first entry is used.
+        self.generated_caption = self.image_caption_model.caption_image_local_pipeline(self.image_path)[0]["generated_text"]
+        self.caption_generation_duration = time.time() - caption_start_time
+        return self.generated_caption
+    def generate_description(self):
+        """Turn the stored caption into a music description, store and return it.
+
+        Reads `self.generated_caption`, so caption_image() must have run first.
+        """
+        description_start_time = time.time()
+        messages = [
+            {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
+            {"role": "user", "content": self.generated_caption},
+        ]
+        # The last message of the last returned conversation is taken as the model's reply.
+        self.generated_description = self.text_generation_model.generate_text_local_pipeline(messages)[-1]['generated_text'][-1]['content']
+        self.description_generation_duration = time.time() - description_start_time
+        return self.generated_description
+    def generate_music(self):
+        """Render the stored description to `self.audio_path` and return that path.
+
+        Reads `self.generated_description`, so generate_description() must have run first.
+        """
+        music_start_time = time.time()
+        self.music_generation_model.generate_music_local_pipeline(self.generated_description, self.audio_path)
+        self.music_generation_duration = time.time() - music_start_time
+        return self.audio_path
+    def get_durations(self):
+        """Return a multi-line summary of the three stage durations and their sum."""
+        return f"Caption Generation Time: {self.caption_generation_duration:.2f} seconds\nDescription Generation Time: {self.description_generation_duration:.2f} seconds\nMusic Generation Time: {self.music_generation_duration:.2f} seconds\nTotal Time: {self.caption_generation_duration + self.description_generation_duration + self.music_generation_duration:.2f} seconds"
+    def run_yield(self, image_path):
+        """Run the pipeline as a generator, yielding after every stage.
+
+        Each yielded value is a four-element list
+        [caption, description, audio_path, durations]; entries that are not
+        available yet are None. The last yielded list is the complete result
+        and equals what run() returns.
+        """
+        self.caption_image(image_path)
+        yield [self.generated_caption, None, None, None]
+        self.generate_description()
+        yield [self.generated_caption, self.generated_description, None, None]
+        self.generate_music()
+        yield [self.generated_caption, self.generated_description, self.audio_path, None]
+        # This must be a yield: a `return value` inside a generator only ends
+        # the iteration, so the consumer would never receive the durations.
+        yield [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]
+    def run(self, image_path):
+        """Run all three stages and return [caption, description, audio_path, durations]."""
+        self.caption_image(image_path)
+        self.generate_description()
+        self.generate_music()
+        return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]
+# Gradio UI
+def gradio():
+    """Build the Gradio Blocks UI for the image-to-music pipeline and launch it."""
+    # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
+    # NOTE(review): the layout below is built with gr.Blocks, not ChatInterface; the link above may be outdated.
+    with gr.Blocks() as demo:
+        gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
+        # type="filepath" hands the uploaded image to the callback as a path string.
+        image_input = gr.Image(type="filepath", label="Upload Image")
+        with gr.Row():
+            caption_output = gr.Textbox(label="Image Caption")
+            music_description_output = gr.Textbox(label="Music Description")
+            durations = gr.Textbox(label="Processing Times", interactive=False, placeholder="Time statistics will appear here")
+        music_output = gr.Audio(label="Generated Music")
+        # Button to trigger the process
+        generate_button = gr.Button("Generate Music")
+        # A single Image_To_Music instance is created here and shared by every button click.
+        itm = Image_To_Music()
+        # run() returns only after all three stages finish; its four return values
+        # map onto the four output components in the order listed.
+        generate_button.click(fn=itm.run, inputs=image_input, outputs=[caption_output, music_description_output, music_output, durations])
+    # Launch Gradio app
+    demo.launch()
+# Build and launch the UI whenever this module is executed; there is no __main__ guard, so importing it also launches.
+gradio()

config.py CHANGED Viewed

@@ -1,3 +1,10 @@
 IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
 LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
-MUSICGEN_MODEL = "facebook/musicgen-small"

+# Model identifier used by the image captioning stage.
 IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
+# Model identifier used to turn a caption into a music description.
 LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
+# Generation limits written onto the LLM's model config in phi3_mini_4k_instruct.py.
+LLM_MAX_LENGTH = 50
+LLM_MAX_NEW_TOKENS = 50
+# Model identifier used by the music generation stage.
+MUSICGEN_MODEL = "facebook/musicgen-small"
+# Passed as max_new_tokens to the music generation pipeline in musicgen_small.py.
+MUSICGEN_MAX_NEW_TOKENS = 256 # 5 seconds of audio
+# Prefix for generated .wav files; app.py concatenates it directly with the file name, so the trailing slash is required.
+AUDIO_DIR = "Case-Study-1/data/"

musicgen_small.py CHANGED Viewed

@@ -9,6 +9,6 @@ class Musicgen_Small:
     def __init__(self):
         self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
-    def generate_music_local_pipeline(self, prompt):
-        music = self.local_pipeline(prompt, forward_params={"do_sample": True})
-        scipy.io.wavfile.write("data/musicgen_out.wav", rate=music["sampling_rate"], data=music["audio"])

     def __init__(self):
+        # Build the text-to-audio pipeline once for the model named in config.MUSICGEN_MODEL.
+        # `pipeline` is imported above this hunk (presumably transformers.pipeline; confirm).
         self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
+    def generate_music_local_pipeline(self, prompt, audio_path):
+        """Generate music for `prompt` and write it to `audio_path` as a .wav file.
+
+        Args:
+            prompt: Text description of the music to generate.
+            audio_path: Destination file path; its parent directory is created if missing.
+        """
+        # Function-scope import: the file's import block lies outside this hunk.
+        import os
+        # Create the target directory before the expensive generation step so a
+        # missing directory cannot make the final write fail.
+        out_dir = os.path.dirname(audio_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        # Sampling is enabled and the output length is capped by config.MUSICGEN_MAX_NEW_TOKENS.
+        music = self.local_pipeline(prompt, forward_params={"do_sample": True, "max_new_tokens": config.MUSICGEN_MAX_NEW_TOKENS})
+        scipy.io.wavfile.write(audio_path, rate=music["sampling_rate"], data=music["audio"])

phi3_mini_4k_instruct.py CHANGED Viewed

@@ -8,6 +8,8 @@ import config
 class Phi3_Mini_4k_Instruct:
     def __init__(self):
         self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
     def generate_text_local_pipeline(self, messages):
         result = self.local_pipeline(messages)

 class Phi3_Mini_4k_Instruct:
     def __init__(self):
+        # Build the text-generation pipeline for config.LLM_MODEL.
+        # NOTE(review): trust_remote_code=True presumably permits code shipped with the model repository to run; confirm this is intended.
         self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
+        # Write the generation limits from config onto the loaded model's config.
+        # NOTE(review): both max_length and max_new_tokens are set; confirm which one takes effect during generation.
+        self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
+        self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
     def generate_text_local_pipeline(self, messages):
         result = self.local_pipeline(messages)

poc_app.py DELETED Viewed

@@ -1,80 +0,0 @@
-# external imports
-import time
-import gradio as gr
-# local imports
-from blip_image_caption_large import Blip_Image_Caption_Large
-from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct
-from musicgen_small import Musicgen_Small
-#image_to_music function
-def image_to_music(image_path):
-    # test image captioning
-    image_caption_start_time = time.time()
-    image_caption_model = Blip_Image_Caption_Large()
-    test_caption = image_caption_model.caption_image_local_pipeline(image_path)
-    print(test_caption)
-    image_caption_end_time = time.time()
-    # test text generation
-    text_generation_start_time = time.time()
-    text_generation_model = Phi3_Mini_4k_Instruct()
-    #TODO: move this to a config file
-    text_generation_model.local_pipeline.model.config.max_new_tokens = 200
-    #TODO: move system prompt somewhere else, allow for genre override
-    messages = [
-    {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
-    {"role": "user", "content": test_caption[0]["generated_text"]},
-    ]
-    test_text = text_generation_model.generate_text_local_pipeline(messages)
-    print(test_text)
-    text_generation_end_time = time.time()
-    # test audio generation
-    music_generation_start_time = time.time()
-    music_generation_model = Musicgen_Small()
-    music_generation_model.generate_music_local_pipeline(str(test_text[-1]['generated_text'][-1]['content']))
-    music_generation_end_time = time.time()
-    # calculate durations
-    image_caption_duration = image_caption_end_time - image_caption_start_time
-    text_generation_duration = text_generation_end_time - text_generation_start_time
-    music_generation_duration = music_generation_end_time - music_generation_start_time
-    total_duration = music_generation_end_time - image_caption_start_time
-    # output generated_text, audio and duration to gradio
-    return (test_caption[0]["generated_text"], test_text[-1]['generated_text'][-1]['content'], "data/musicgen_out.wav",
-            f"Image Captioning Duration: {image_caption_duration} sec",
-            f"Text Generation Duration: {text_generation_duration} sec",
-            f"Music Generation Duration: {music_generation_duration} sec",
-            f"Total Duration: {total_duration} sec")
-# Gradio UI
-def gradio():
-    # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
-    with gr.Blocks() as demo:
-        gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
-        image_input = gr.Image(type="filepath", label="Upload Image")
-        with gr.Row():
-            caption_output = gr.Textbox(label="Image Caption")
-            music_description_output = gr.Textbox(label="Music Description")
-            durations = gr.Textbox(label="Processing Times", interactive=False, placeholder="Time statistics will appear here")
-        music_output = gr.Audio(label="Generated Music")
-        # Button to trigger the process
-        generate_button = gr.Button("Generate Music")
-        generate_button.click(fn=image_to_music, inputs=[image_input], outputs=[caption_output, music_description_output, music_output, durations])
-    # Launch Gradio app
-    demo.launch()
-gradio()

requirements.txt CHANGED Viewed

@@ -1,27 +1,70 @@
 certifi==2024.8.30
 charset-normalizer==3.3.2
 filelock==3.16.0
 fsspec==2024.9.0
 huggingface-hub==0.24.6
 idna==3.8
 Jinja2==3.1.4
 MarkupSafe==2.1.5
 mpmath==1.3.0
 networkx==3.3
 numpy==2.1.1
 packaging==24.1
 pillow==10.4.0
 PyYAML==6.0.2
 regex==2024.9.11
 requests==2.32.3
 safetensors==0.4.5
 scipy==1.14.1
 sympy==1.13.2
 tokenizers==0.19.1
 torch==2.4.1
 torchaudio==2.4.1
 torchvision==0.19.1
 tqdm==4.66.5
 transformers==4.44.2
 typing_extensions==4.12.2
 urllib3==2.2.2

+accelerate==0.34.2
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.4.0
 certifi==2024.8.30
 charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.3.0
+cycler==0.12.1
+fastapi==0.114.2
+ffmpy==0.4.0
 filelock==3.16.0
+fonttools==4.53.1
 fsspec==2024.9.0
+gradio==4.44.0
+gradio_client==1.3.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
 huggingface-hub==0.24.6
 idna==3.8
+importlib_resources==6.4.5
 Jinja2==3.1.4
+kiwisolver==1.4.7
+markdown-it-py==3.0.0
 MarkupSafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
 mpmath==1.3.0
 networkx==3.3
 numpy==2.1.1
+orjson==3.10.7
 packaging==24.1
+pandas==2.2.2
 pillow==10.4.0
+psutil==6.0.0
+pydantic==2.9.1
+pydantic_core==2.23.3
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+pytz==2024.2
 PyYAML==6.0.2
 regex==2024.9.11
 requests==2.32.3
+rich==13.8.1
+ruff==0.6.5
 safetensors==0.4.5
 scipy==1.14.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.5
 sympy==1.13.2
 tokenizers==0.19.1
+tomlkit==0.12.0
 torch==2.4.1
 torchaudio==2.4.1
 torchvision==0.19.1
 tqdm==4.66.5
 transformers==4.44.2
+typer==0.12.5
 typing_extensions==4.12.2
+tzdata==2024.1
 urllib3==2.2.2
+uvicorn==0.30.6
+websockets==12.0