Julian-Hans committed
Commit fa554aa
1 Parent(s): a15bc9b

implemented option to use inference endpoints, implemented parameter selection, updated UI, cleaned up return formats of models

Files changed (5)
  1. app.py +50 -9
  2. blip_image_caption_large.py +15 -2
  3. config.py +14 -1
  4. musicgen_small.py +33 -1
  5. phi3_mini_4k_instruct.py +18 -3
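
In short, each model wrapper now exposes a single entry point (caption_image, generate_text, generate_music) that routes either to a local transformers pipeline or to the Hugging Face Inference API, driven by boolean flags on Image_To_Music. A minimal sketch of the new calling pattern, assuming app.py imports cleanly and using a hypothetical image path:

# Sketch only: the flags decide local pipeline vs. Inference API per stage.
from app import Image_To_Music

itm = Image_To_Music(use_local_caption=False,  # BLIP caption via Inference API
                     use_local_llm=False,      # Phi-3 description via Inference API
                     use_local_musicgen=True)  # MusicGen runs locally
caption, description, audio_path, durations = itm.run("example.jpg")  # "example.jpg" is a placeholder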
app.py CHANGED
@@ -16,7 +16,12 @@ log.basicConfig(level=log.INFO)


class Image_To_Music:
-    def __init__(self):
+    def __init__(self, use_local_caption=False, use_local_llm=False, use_local_musicgen=False):
+
+        self.use_local_llm = use_local_llm
+        self.use_local_caption = use_local_caption
+        self.use_local_musicgen = use_local_musicgen
+
        self.image_path = None
        self.generated_caption = None
        self.generated_description = None
@@ -44,14 +49,14 @@ class Image_To_Music:
        self.image_caption_model = Blip_Image_Caption_Large()

        self.image_path = image_path
-        self.generated_caption = self.image_caption_model.caption_image_local_pipeline(self.image_path)[0]["generated_text"]
+        self.generated_caption = self.image_caption_model.caption_image(self.image_path, self.use_local_caption)

        # delete model to free up ram
        del self.image_caption_model
        gc.collect()

        self.caption_generation_duration = time.time() - caption_start_time
-        log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption}")
+        log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption} - used local model: {self.use_local_caption}")
        return self.generated_caption

    def generate_description(self):
@@ -65,14 +70,14 @@ class Image_To_Music:
            {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
            {"role": "user", "content": self.generated_caption},
        ]
-        self.generated_description = self.text_generation_model.generate_text_local_pipeline(messages)[-1]['generated_text'][-1]['content']
+        self.generated_description = self.text_generation_model.generate_text(messages, self.use_local_llm)

        # delete model to free up ram
        del self.text_generation_model
        gc.collect()

        self.description_generation_duration = time.time() - description_start_time
-        log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description}")
+        log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description} - used local model: {self.use_local_llm}")
        return self.generated_description

    def generate_music(self):
@@ -82,14 +87,14 @@ class Image_To_Music:
        # load model
        self.music_generation_model = Musicgen_Small()

-        self.music_generation_model.generate_music_local_pipeline(self.generated_description, self.audio_path)
+        self.music_generation_model.generate_music(self.generated_description, self.audio_path, self.use_local_musicgen)

        # delete model to free up ram
        del self.music_generation_model
        gc.collect()

        self.music_generation_duration = time.time() - music_start_time
-        log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path}")
+        log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path} - used local model: {self.use_local_musicgen}")
        return self.audio_path

    def get_durations(self):
@@ -112,12 +117,49 @@ class Image_To_Music:
        return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]


+def run_image_to_music(image_path, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, use_local_caption, use_local_llm, use_local_musicgen):
+    config.LLM_MAX_NEW_TOKENS = llm_max_new_tokens
+    config.LLM_TEMPERATURE = llm_temperature
+    config.LLM_TOP_P = llm_top_p
+    config.MUSICGEN_MAX_NEW_TOKENS = musicgen_max_seconds * 51
+    itm = Image_To_Music(use_local_caption=use_local_caption, use_local_llm=use_local_llm, use_local_musicgen=use_local_musicgen)
+    return itm.run(image_path)
+
# Gradio UI
def gradio():
    # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
    with gr.Blocks() as demo:
        gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
        image_input = gr.Image(type="filepath", label="Upload Image")
+
+        # ----ATTRIBUTION-START----
+        # LLM: ChatGPT4o
+        # PROMPT: i need 3 checkbox fields that pass booleans to the run_image_to_music function. it should be "Use local Image Captioning" "Use local LLM" "Use local Music Generation". please make it a nice parameter selector
+        # EDITS: /
+
+        # Checkbox parameters
+        with gr.Row():
+            local_captioning = gr.Checkbox(label="Use local Image Captioning", value=False)
+            local_llm = gr.Checkbox(label="Use local LLM", value=False)
+            local_music_gen = gr.Checkbox(label="Use local Music Generation", value=False)
+        # -----ATTRIBUTION-END-----
+
+        # ----ATTRIBUTION-START----
+        # LLM: ChatGPT4o
+        # PROMPT: now, i need sliders for the different models that are used in the product:\n LLM_MAX_NEW_TOKENS = 50\nLLM_TEMPERATURE = 0.7\nLLM_TOP_P = 0.95\nMUSICGEN_MAX_NEW_TOKENS = 256 # 256 = 5 seconds of audio\n they should be in a hidden menu that opens when i click on "advanced options"\nplease label them for the end user and fit them nicely in the following ui: <code>
+        # EDITS: added interactive flags
+        # Advanced options with sliders
+        with gr.Accordion("Advanced Options", open=False):
+            gr.Markdown("<h3>LLM Settings</h3>")
+            llm_max_new_tokens = gr.Slider(1, 200, value=50, step=1, label="LLM Max Tokens", interactive=True)
+            llm_temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.01, label="LLM Temperature", interactive=True)
+            llm_top_p = gr.Slider(0.01, 0.99, value=0.95, step=0.01, label="LLM Top P", interactive=True)
+
+            gr.Markdown("<h3>Music Generation Settings</h3>")
+            musicgen_max_seconds = gr.Slider(1, 30, value=5, step=1, label="MusicGen Duration in Seconds (local model only)", interactive=True)
+        # -----ATTRIBUTION-END-----
+
        with gr.Row():
            caption_output = gr.Textbox(label="Image Caption")
            music_description_output = gr.Textbox(label="Music Description")
@@ -126,8 +168,7 @@ def gradio():
        music_output = gr.Audio(label="Generated Music")
        # Button to trigger the process
        generate_button = gr.Button("Generate Music")
-        itm = Image_To_Music()
-        generate_button.click(fn=itm.run, inputs=image_input, outputs=[caption_output, music_description_output, music_output, durations])
+        generate_button.click(fn=run_image_to_music, inputs=[image_input, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, local_captioning, local_llm, local_music_gen], outputs=[caption_output, music_description_output, music_output, durations])
        # Launch Gradio app
        demo.launch()
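
The seconds-to-tokens conversion in run_image_to_music follows from the config.py comment that 256 max_new_tokens correspond to roughly 5 seconds of audio, i.e. about 51 tokens per second, so the Gradio slider passes seconds and the handler multiplies by 51. A quick check of that arithmetic, using only values visible in the diff above:

# 256 tokens ~= 5 s of MusicGen audio  ->  ~51 tokens per second (the factor used in run_image_to_music)
tokens_per_second = 256 / 5      # 51.2
print(round(tokens_per_second))  # 51
print(10 * 51)                   # a 10-second slider value becomes 510 max_new_tokens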
 
blip_image_caption_large.py CHANGED
@@ -1,13 +1,26 @@
# external imports
from transformers import pipeline
+from huggingface_hub import InferenceClient

# local imports
import config

class Blip_Image_Caption_Large:
    def __init__(self):
-        self.local_pipeline = pipeline("image-to-text", model=config.IMAGE_CAPTION_MODEL)
+        pass

+    def caption_image(self, image_path, use_local_caption):
+        if use_local_caption:
+            return self.caption_image_local_pipeline(image_path)
+        else:
+            return self.caption_image_api(image_path)
+
    def caption_image_local_pipeline(self, image_path):
-        result = self.local_pipeline(image_path)
+        self.local_pipeline = pipeline("image-to-text", model=config.IMAGE_CAPTION_MODEL)
+        result = self.local_pipeline(image_path)[0]['generated_text']
        return result
+
+    def caption_image_api(self, image_path):
+        client = InferenceClient(config.IMAGE_CAPTION_MODEL, token=config.HF_API_TOKEN)
+        result = client.image_to_text(image_path).generated_text
+        return result
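
With this change the captioner builds its transformers pipeline lazily inside caption_image_local_pipeline, so the constructor stays cheap and the API branch never downloads BLIP weights. A minimal usage sketch, assuming HF_API_TOKEN is set for the remote path and using a hypothetical image file:

from blip_image_caption_large import Blip_Image_Caption_Large

captioner = Blip_Image_Caption_Large()
# use_local_caption=False -> InferenceClient.image_to_text; True -> local "image-to-text" pipeline
caption = captioner.caption_image("example.jpg", use_local_caption=False)  # "example.jpg" is a placeholder
print(caption)  # plain string in either branch after this commit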
config.py CHANGED
@@ -1,10 +1,23 @@
+import os
+import logging as log
+log.basicConfig(level=log.INFO)
+
IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"

LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
LLM_MAX_LENGTH = 50
LLM_MAX_NEW_TOKENS = 50
+LLM_TEMPERATURE = 0.7
+LLM_TOP_P = 0.95

MUSICGEN_MODEL = "facebook/musicgen-small"
+MUSICGEN_MODEL_API_URL = f"https://api-inference.huggingface.co/models/{MUSICGEN_MODEL}"
MUSICGEN_MAX_NEW_TOKENS = 256 # 5 seconds of audio

-AUDIO_DIR = "Case-Study-1/data/"
+AUDIO_DIR = "Case-Study-1/data/"
+
+HF_API_TOKEN = os.getenv("HF_API_TOKEN")
+if HF_API_TOKEN:
+    log.info(f"Read HF_API_TOKEN: {HF_API_TOKEN[0:4]}...")
+else:
+    print("HF_API_TOKEN not found in environment variables.")
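
config.py now reads the token once at import time, so HF_API_TOKEN has to be present in the environment (for example as a Space secret or shell export) before anything imports config; only the first four characters are ever logged. A small sanity check, with a placeholder token value used purely for illustration:

import os

os.environ.setdefault("HF_API_TOKEN", "hf_xxx")  # placeholder value; normally provided as a Space secret

import config

print(config.MUSICGEN_MODEL_API_URL)    # https://api-inference.huggingface.co/models/facebook/musicgen-small
print(config.HF_API_TOKEN is not None)  # True once the variable is set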
musicgen_small.py CHANGED
@@ -1,5 +1,7 @@
# external imports
from transformers import pipeline
+from io import BytesIO
+import requests
import scipy

# local imports
@@ -7,8 +9,38 @@ import config

class Musicgen_Small:
    def __init__(self):
-        self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
+        pass

+    def generate_music(self, prompt, audio_path, use_local_musicgen):
+        if use_local_musicgen:
+            self.generate_music_local_pipeline(prompt, audio_path)
+        else:
+            self.generate_music_api(prompt, audio_path)
+
    def generate_music_local_pipeline(self, prompt, audio_path):
+        self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
        music = self.local_pipeline(prompt, forward_params={"do_sample": True, "max_new_tokens": config.MUSICGEN_MAX_NEW_TOKENS})
        scipy.io.wavfile.write(audio_path, rate=music["sampling_rate"], data=music["audio"])
+
+    def generate_music_api(self, prompt, audio_path):
+        headers = {"Authorization": f"Bearer {config.HF_API_TOKEN}"}
+        payload = {
+            "inputs": prompt
+        }
+
+        response = requests.post(config.MUSICGEN_MODEL_API_URL, headers=headers, json=payload)
+
+        # ----ATTRIBUTION-START----
+        # LLM: ChatGPT4o
+        # PROMPT: please save the audio to a .wav file
+        # EDITS: changed variables to match the code
+
+        # Convert the byte content into an audio array
+        audio_buffer = BytesIO(response.content)
+
+        # Use scipy to save the audio, assuming it's a WAV format audio stream
+        # If it's raw PCM audio, you would need to decode it first.
+        with open(audio_path, "wb") as f:
+            f.write(audio_buffer.read())
+        # -----ATTRIBUTION-END-----
+
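
One caveat with the API branch above: response.content is written to the .wav file unconditionally, so an error response (for example while the hosted model is still loading) would be saved as if it were audio. A defensive variation, a sketch only, reusing the same endpoint and payload as the diff:

import requests

import config

def generate_music_api_checked(prompt, audio_path):
    # Same request as Musicgen_Small.generate_music_api, plus a status check before writing bytes.
    headers = {"Authorization": f"Bearer {config.HF_API_TOKEN}"}
    response = requests.post(config.MUSICGEN_MODEL_API_URL, headers=headers, json={"inputs": prompt})
    response.raise_for_status()  # raises on 4xx/5xx (e.g. 503 while the model warms up)
    with open(audio_path, "wb") as f:
        f.write(response.content)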
phi3_mini_4k_instruct.py CHANGED
@@ -1,5 +1,6 @@
# external imports
from transformers import pipeline
+from huggingface_hub import InferenceClient

# local imports
import config
@@ -7,10 +8,24 @@ import config

class Phi3_Mini_4k_Instruct:
    def __init__(self):
+        pass
+
+    def generate_text(self, messages, use_local_llm):
+        if use_local_llm:
+            return self.generate_text_local_pipeline(messages)
+        else:
+            return self.generate_text_api(messages)
+
+    def generate_text_local_pipeline(self, messages):
        self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
        self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
        self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
-
-    def generate_text_local_pipeline(self, messages):
-        result = self.local_pipeline(messages)
+        self.local_pipeline.model.config.temperature = config.LLM_TEMPERATURE
+        self.local_pipeline.model.config.top_p = config.LLM_TOP_P
+        result = self.local_pipeline(messages)[-1]['generated_text'][-1]['content']
        return result
+
+    def generate_text_api(self, messages):
+        client = InferenceClient(config.LLM_MODEL, token=config.HF_API_TOKEN)
+        result = client.chat_completion(messages, max_tokens=config.LLM_MAX_NEW_TOKENS, temperature=config.LLM_TEMPERATURE, top_p=config.LLM_TOP_P).choices[0].message.content
+        return result
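
Both branches now return just the assistant message as a plain string: the local pipeline output is unwrapped with [-1]['generated_text'][-1]['content'] and the API branch takes choices[0].message.content, which is the "cleaned up return formats" part of the commit message. A minimal usage sketch, assuming a valid HF_API_TOKEN for the remote path; the example messages are placeholders, not from the repo:

from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct

llm = Phi3_Mini_4k_Instruct()
messages = [
    {"role": "system", "content": "Answer in one short sentence."},
    {"role": "user", "content": "a cozy cabin by a lake at sunset"},
]
# use_local_llm=False -> InferenceClient.chat_completion; True -> local text-generation pipeline
description = llm.generate_text(messages, use_local_llm=False)
print(description)  # plain string in either branch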