huchiahsi commited on
Commit
88c11ee
β€’
1 Parent(s): 51421b7

Upload 13 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ examples/mona_diner.png filter=lfs diff=lfs merge=lfs -text
2
+ examples/santa.png filter=lfs diff=lfs merge=lfs -text
3
+ examples/winter_hiking.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import spaces
4
+ import json
5
+ import re
6
+ import random
7
+ import numpy as np
8
+ from gradio_client import Client
9
+ hf_token = os.environ.get("HF_TOKEN")
10
+
11
+ MAX_SEED = np.iinfo(np.int32).max
12
+
13
def check_api(model_name):
    """Check whether the Space backing *model_name* currently accepts connections.

    Parameters
    ----------
    model_name : str
        One of the dropdown choices ("MAGNet", "AudioLDM-2", ...).

    Returns
    -------
    str | None
        "api ready" if a gradio client could connect, "api not ready yet" on
        any failure. Unknown names return None (the original fell through
        its elif chain the same way; kept for backward compatibility).
    """
    # One entry per supported model: (Space address, token). Stable Audio
    # Open is a gated Space and must be reached with the HF token.
    spaces_by_model = {
        "MAGNet": ("https://fffiloni-magnet.hf.space/", None),
        "AudioLDM-2": ("https://haoheliu-audioldm2-text2audio-text2music.hf.space/", None),
        "Riffusion": ("https://fffiloni-spectrogram-to-music.hf.space/", None),
        "Mustango": ("https://declare-lab-mustango.hf.space/", None),
        "MusicGen": ("https://facebook-musicgen.hf.space/", None),
        "Stable Audio Open": ("fffiloni/Stable-Audio-Open-A10", hf_token),
    }
    if model_name not in spaces_by_model:
        return None
    space, token = spaces_by_model[model_name]
    try:
        # Connecting is the probe; the client object itself is discarded.
        Client(space, hf_token=token)
        return "api ready"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any connection failure is non-fatal.
        return "api not ready yet"
50
+
51
+
52
+ from moviepy.editor import VideoFileClip
53
+ from moviepy.audio.AudioClip import AudioClip
54
+
55
def extract_audio(video_in):
    """Extract the audio track of *video_in* into 'audio.wav' and return its path.

    Parameters
    ----------
    video_in : str
        Path to the input video file.

    Returns
    -------
    str
        Path of the written wav file ('audio.wav', cwd-relative).
    """
    output_audio = 'audio.wav'

    # Open the video file and extract the audio.
    video_clip = VideoFileClip(video_in)
    try:
        # Use 44100 Hz as the sample rate for .wav files.
        video_clip.audio.write_audiofile(output_audio, fps=44100)
    finally:
        # BUG FIX: the original never closed the clip; moviepy keeps ffmpeg
        # reader processes/file handles alive, which leak across calls.
        video_clip.close()

    print("Audio extraction complete.")
    return output_audio
68
+
69
+
70
+
71
def get_caption(image_in):
    """Caption *image_in* with the Kosmos-2 Space and return the description text.

    Parameters
    ----------
    image_in : str
        Filepath (or URL) of the image to describe.

    Returns
    -------
    str
        The extracted detailed description.
    """
    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
    kosmos2_result = kosmos2_client.predict(
        image_in,    # str (filepath or URL to image) in 'Test Image' Image component
        "Detailed",  # str in 'Description Type' Radio component
        fn_index=4
    )

    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    # Second element of the result is a JSON file whose entries are
    # [token, ...] sublists; rebuild the sentence from the first fields.
    with open(kosmos2_result[1], 'r') as f:
        data = json.load(f)

    full_sentence = ' '.join(sublist[0] for sublist in data)

    # Kosmos-2 echoes the instruction; strip the leading
    # "Describe this image in detail:" prefix to keep only the description.
    pattern = r'^Describe this image in detail:\s*(.*)$'
    match = re.search(pattern, full_sentence)
    if match:
        description = match.group(1)
        print(description)
    else:
        # BUG FIX: the original left `description` unbound here, so the
        # `return` below raised UnboundLocalError. Fall back to the raw
        # reconstructed sentence instead.
        print("Unable to locate valid description.")
        description = full_sentence

    return description
111
+
112
def get_caption_from_MD(image_in):
    """Ask the moondream1 Space to describe *image_in*; return its answer string."""
    md_client = Client("https://vikhyatk-moondream1.hf.space/")
    answer = md_client.predict(
        image_in,                          # filepath in 'image' Image component
        "Describe precisely the image.",   # str in 'Question' Textbox component
        api_name="/answer_question",
    )
    print(answer)
    return answer
121
+
122
def get_magnet(prompt):
    """Generate music for *prompt* via the MAGNet Space; return the audio file path."""
    magnet_client = Client("https://fffiloni-magnet.hf.space/")
    outputs = magnet_client.predict(
        "facebook/magnet-small-10secs",  # model variant (Radio component)
        "",                              # custom model path (unused)
        prompt,                          # input text
        3,                               # temperature
        0.9,                             # top-p
        10,                              # max CFG coefficient
        1,                               # min CFG coefficient
        20,                              # decoding steps, stage 1
        10,                              # decoding steps, stage 2
        10,                              # decoding steps, stage 3
        10,                              # decoding steps, stage 4
        "prod-stride1 (new!)",           # span scoring strategy
        api_name="/predict_full",
    )
    print(outputs)
    # The endpoint returns several values; the generated audio is at index 1.
    return outputs[1]
142
+
143
def get_audioldm(prompt):
    """Generate audio for *prompt* with the AudioLDM-2 Space; return a wav path."""
    audioldm_client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
    chosen_seed = random.randint(0, MAX_SEED)
    video_out = audioldm_client.predict(
        prompt,          # input text
        "Low quality.",  # negative prompt
        10,              # duration in seconds (5-15)
        6.5,             # guidance scale (0-7)
        chosen_seed,     # seed
        3,               # number of waveforms to generate (1-5)
        fn_index=1,
    )
    print(video_out)
    # The Space returns a video file; strip it down to a plain wav track.
    return extract_audio(video_out)
158
+
159
def get_riffusion(prompt):
    """Generate music for *prompt* via the Riffusion spectrogram Space."""
    riffusion_client = Client("https://fffiloni-spectrogram-to-music.hf.space/")
    outputs = riffusion_client.predict(
        prompt,  # musical prompt
        "",      # negative prompt
        None,    # optional seed audio (unused)
        10,      # duration in seconds (5-10)
        api_name="/predict",
    )
    print(outputs)
    # Index 1 of the returned tuple is the generated audio file.
    return outputs[1]
170
+
171
def get_mustango(prompt):
    """Generate music for *prompt* with the Mustango Space; return the audio output."""
    mustango_client = Client("https://declare-lab-mustango.hf.space/")
    audio_out = mustango_client.predict(
        prompt,  # musical prompt text
        200,     # diffusion steps (100-200)
        6,       # guidance scale (1-10)
        api_name="/predict",
    )
    print(audio_out)
    return audio_out
181
+
182
def get_musicgen(prompt):
    """Generate music for *prompt* with the MusicGen Space; return the audio path."""
    musicgen_client = Client("https://facebook-musicgen.hf.space/")
    outputs = musicgen_client.predict(
        prompt,  # music description
        None,    # optional melody audio file (unused)
        fn_index=0,
    )
    print(outputs)
    # fn_index=0 returns several values; the generated audio is at index 1.
    return outputs[1]
191
+
192
def get_stable_audio_open(prompt):
    """Generate audio for *prompt* with the gated Stable Audio Open Space."""
    # Private Space: authenticate with the module-level HF token.
    sao_client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
    audio_out = sao_client.predict(
        prompt=prompt,
        seconds_total=10,
        steps=100,
        cfg_scale=7,
        api_name="/predict",
    )
    print(audio_out)
    return audio_out
203
+
204
# Local LLM used to turn an image caption into a musical prompt.
import re  # NOTE(review): duplicate of the top-of-file import; kept as-is
import torch
from transformers import pipeline

# Candidate chat models; only zephyr is actually loaded below.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # NOTE(review): currently unused

# bfloat16 + device_map="auto" lets accelerate place the model on the GPU.
pipe = pipeline("text-generation", model=zephyr_model, torch_dtype=torch.bfloat16, device_map="auto")

# System prompt used for every model: produce one short musical prompt line.
standard_sys = f"""
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model.

For example, if a user says, "a picture of a man in a black suit and tie riding a black dragon", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle"
"""

# Mustango-specific variant that forces a chord progression in the output.
# NOTE(review): defined but never selected (get_musical_prompt always uses
# standard_sys) — confirm whether this is intentional.
mustango_sys = f"""
You are a musician AI whose job is to help users create their own music which its genre will reflect the character or scene from an image described by users.
In particular, you need to respond succintly with few musical words, in a friendly tone, write a musical prompt for a music generation model, you MUST include chords progression.

For example, if a user says, "a painting of three old women having tea party", provide immediately a musical prompt corresponding to the image description.
Immediately STOP after that. It should be EXACTLY in this format:
"The song is an instrumental. The song is in medium tempo with a classical guitar playing a lilting melody in accompaniment style. The song is emotional and romantic. The song is a romantic instrumental song. The chord sequence is Gm, F6, Ebm. The time signature is 4/4. This song is in Adagio. The key of this song is G minor."
"""
230
+
231
@spaces.GPU(enable_queue=True)
def get_musical_prompt(user_prompt, chosen_model):
    """Turn an image caption into a one-line musical prompt with the zephyr pipe.

    Parameters
    ----------
    user_prompt : str
        Image caption to base the musical prompt on.
    chosen_model : str
        Kept for interface compatibility; every model currently uses the
        same system prompt (the original's per-model branch was dead,
        commented-out code and has been removed).

    Returns
    -------
    str
        The generated musical prompt with the chat template stripped.
    """
    agent_maker_sys = standard_sys

    # zephyr chat template: system block, then the user's caption.
    instruction = f"""
<|system|>
{agent_maker_sys}</s>
<|user|>
"""

    prompt = f"{instruction.strip()}\n{user_prompt}</s>"
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)

    # Drop everything from <|system|> through <|assistant|>, keeping only
    # the model's reply.
    pattern = r'\<\|system\|\>(.*?)\<\|assistant\|\>'
    cleaned_text = re.sub(pattern, '', outputs[0]["generated_text"], flags=re.DOTALL)

    print(f"SUGGESTED Musical prompt: {cleaned_text}")
    return cleaned_text.lstrip("\n")
255
+
256
def infer(image_in, chosen_model, api_status):
    """Full pipeline: caption the image, build a musical prompt, generate music.

    Parameters
    ----------
    image_in : str | None
        Filepath of the uploaded image (None when nothing is uploaded).
    chosen_model : str
        Selected music model name from the dropdown.
    api_status : str
        Value of the API-status textbox ("api ready" / "api not ready yet").

    Returns
    -------
    tuple
        (caption textbox update, retry-button visibility update, music output).

    Raises
    ------
    gr.Error
        On missing image, missing model, or unavailable/unknown model API.
    """
    # `is None` instead of `== None`: identity check is the correct idiom
    # and avoids surprises with objects overriding __eq__.
    if image_in is None:
        raise gr.Error("Please provide an image input")

    if chosen_model == []:
        raise gr.Error("Please pick a model")

    if api_status == "api not ready yet":
        raise gr.Error("This model is not ready yet, you can pick another one instead :)")

    gr.Info("Getting image caption with Kosmos-2...")
    user_prompt = get_caption(image_in)

    gr.Info("Building a musical prompt according to the image caption ...")
    musical_prompt = get_musical_prompt(user_prompt, chosen_model)

    # Dispatch table replaces the long if/elif chain. BUG FIX: an unknown
    # model previously fell through every branch and left `music_o` unbound,
    # raising NameError at the return below.
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
        "Stable Audio Open": get_stable_audio_open,
    }
    if chosen_model not in generators:
        raise gr.Error("Please pick a model")
    gr.Info(f"Now calling {chosen_model} for music...")
    music_o = generators[chosen_model](musical_prompt)

    return gr.update(value=musical_prompt, interactive=True), gr.update(visible=True), music_o
293
+
294
def retry(chosen_model, caption):
    """Re-generate music from an (optionally user-edited) caption.

    Skips the captioning/prompt-building steps of `infer` and feeds the
    textbox content straight to the chosen music model.

    Parameters
    ----------
    chosen_model : str
        Selected music model name.
    caption : str
        The musical prompt text (possibly edited by the user).

    Returns
    -------
    The music output produced by the chosen model's Space.

    Raises
    ------
    gr.Error
        If the model name is not one of the supported choices.
    """
    musical_prompt = caption

    # Same dispatch table as `infer`. BUG FIX: an unrecognized model name
    # previously left `music_o` unbound and raised NameError on return.
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "Riffusion": get_riffusion,
        "Mustango": get_mustango,
        "MusicGen": get_musicgen,
        "Stable Audio Open": get_stable_audio_open,
    }
    if chosen_model not in generators:
        raise gr.Error("Please pick a model")
    gr.Info(f"Now calling {chosen_model} for music...")
    return generators[chosen_model](musical_prompt)
317
+
318
# UI copy (Traditional Chinese): "image-to-music system" title and
# "convert the uploaded image or an example image into music" description.
demo_title = "εœ–η‰‡θ½‰ζ›ζˆιŸ³ζ¨‚η³»η΅±"
description = "ε°‡δΈŠε‚³ηš„ε½±η‰‡ζˆ–η―„δΎ‹ε½±η‰‡θ½‰ζ›η‚ΊιŸ³ζ¨‚"

# Page styling: center the main column and enlarge the generated-prompt textbox.
css = """
#col-container {
    margin: 0 auto;
    max-width: 980px;
    text-align: left;
}
#inspi-prompt textarea {
    font-size: 20px;
    line-height: 24px;
    font-weight: 600;
}

"""
334
+
335
# ---- Gradio UI layout and event wiring ------------------------------------
# NOTE(review): the original nesting was flattened by the diff rendering;
# the layout below (image/model/examples on the left, caption/retry/audio on
# the right) is the reconstruction implied by the component order — confirm
# against the deployed Space.
with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):

        # Page header built from the title/description constants above.
        gr.HTML(f"""
        <h2 style="text-align: center;">{demo_title}</h2>
        <p style="text-align: center;">{description}</p>
        """)

        with gr.Row():

            with gr.Column():
                # Input image ("upload an image file").
                image_in = gr.Image(
                    label = "δΈŠε‚³εœ–η‰‡ζͺ”ζ‘ˆ",
                    type = "filepath",
                    elem_id = "image-in"
                )

                with gr.Row():

                    # Music model picker ("choose a music model").
                    chosen_model = gr.Dropdown(
                        label = "ιΈζ“‡ιŸ³ζ¨‚ζ¨‘εž‹",
                        choices = [
                            "MAGNet",
                            "AudioLDM-2",
                            "Riffusion",
                            "Mustango",
                            "MusicGen",
                            "Stable Audio Open"
                        ],
                        value = None,
                        filterable = False
                    )

                    # Read-only indicator ("is the API available"), filled by check_api.
                    check_status = gr.Textbox(
                        label="APIζ˜―ε¦ε―η”¨",
                        interactive=False
                    )

                # Main action button ("convert image to music").
                submit_btn = gr.Button("ε°‡εœ–η‰‡θ½‰ζ›ζˆιŸ³ζ¨‚")

                # Clickable example images that run `infer` directly.
                gr.Examples(
                    examples = [
                        ["examples/ocean_poet.jpeg"],
                        ["examples/jasper_horace.jpeg"],
                        ["examples/summer.jpeg"],
                        ["examples/mona_diner.png"],
                        ["examples/monalisa.png"],
                        ["examples/santa.png"],
                        ["examples/winter_hiking.png"],
                        ["examples/teatime.jpeg"],
                        ["examples/news_experts.jpeg"]
                    ],
                    fn = infer,
                    inputs = [image_in, chosen_model],
                    examples_per_page = 4
                )

            with gr.Column():

                # Musical prompt derived from the image ("text generated from
                # the image first"); becomes editable after the first run.
                caption = gr.Textbox(
                    label = "εœ–η‰‡ι¦–ε…ˆη”’η”Ÿηš„ζ–‡ε­—",
                    interactive = False,
                    elem_id = "inspi-prompt"
                )

                # "Edit the text and regenerate" — hidden until `infer` reveals it.
                retry_btn = gr.Button("ζ›΄ζ”Ήζ–‡ε­—ι‡ζ–°η”’η”Ÿ", visible=False)

                # Generated music output ("music").
                result = gr.Audio(
                    label = "ιŸ³ζ¨‚"
                )

    # Re-probe API availability whenever the model selection changes;
    # queue=False so the status check bypasses the request queue.
    chosen_model.change(
        fn = check_api,
        inputs = chosen_model,
        outputs = check_status,
        queue = False
    )

    # Regenerate music from the (possibly edited) caption only.
    retry_btn.click(
        fn = retry,
        inputs = [chosen_model, caption],
        outputs = [result]
    )

    # Full pipeline: caption -> musical prompt -> music.
    submit_btn.click(
        fn = infer,
        inputs = [
            image_in,
            chosen_model,
            check_status
        ],
        outputs =[
            caption,
            retry_btn,
            result
        ]
    )

demo.queue(max_size=16).launch(show_api=False, show_error=True, share=True)
examples/blank.md ADDED
File without changes
examples/chicken_adobo.jpeg ADDED
examples/jasper_horace.jpeg ADDED
examples/mona_diner.png ADDED

Git LFS Details

  • SHA256: 8e7ef2c6a51cb4e3fb5e9ec0bf470c1d3927ce1825e9ca6727f5ef5251fe832d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
examples/monalisa.png ADDED
examples/news_experts.jpeg ADDED
examples/ocean_poet.jpeg ADDED
examples/santa.png ADDED

Git LFS Details

  • SHA256: 614642f7ba0dba2bd8034de9c4342a162253c87a46ce553f1b04093aebefbcbc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
examples/summer.jpeg ADDED
examples/teatime.jpeg ADDED
examples/winter_hiking.png ADDED

Git LFS Details

  • SHA256: 7753941699851eb98989ef3ff713d6328d1aecdce34132c3d9fe8cec813ef3f5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.59 MB
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ accelerate
4
+ moviepy
5
+ spaces