ZeyuXie committed on
Commit
8c1bf05
1 Parent(s): 8f85e3b

Upload 167 files

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. audioldm/__init__.py +8 -0
  2. audioldm/__main__.py +183 -0
  3. audioldm/__pycache__/__init__.cpython-310.pyc +0 -0
  4. audioldm/__pycache__/__init__.cpython-37.pyc +0 -0
  5. audioldm/__pycache__/__init__.cpython-39.pyc +0 -0
  6. audioldm/__pycache__/ldm.cpython-310.pyc +0 -0
  7. audioldm/__pycache__/ldm.cpython-37.pyc +0 -0
  8. audioldm/__pycache__/ldm.cpython-39.pyc +0 -0
  9. audioldm/__pycache__/pipeline.cpython-310.pyc +0 -0
  10. audioldm/__pycache__/pipeline.cpython-37.pyc +0 -0
  11. audioldm/__pycache__/pipeline.cpython-39.pyc +0 -0
  12. audioldm/__pycache__/utils.cpython-310.pyc +0 -0
  13. audioldm/__pycache__/utils.cpython-37.pyc +0 -0
  14. audioldm/__pycache__/utils.cpython-39.pyc +0 -0
  15. audioldm/audio/__init__.py +2 -0
  16. audioldm/audio/__pycache__/__init__.cpython-310.pyc +0 -0
  17. audioldm/audio/__pycache__/__init__.cpython-37.pyc +0 -0
  18. audioldm/audio/__pycache__/__init__.cpython-39.pyc +0 -0
  19. audioldm/audio/__pycache__/audio_processing.cpython-310.pyc +0 -0
  20. audioldm/audio/__pycache__/audio_processing.cpython-37.pyc +0 -0
  21. audioldm/audio/__pycache__/audio_processing.cpython-39.pyc +0 -0
  22. audioldm/audio/__pycache__/mix.cpython-39.pyc +0 -0
  23. audioldm/audio/__pycache__/stft.cpython-310.pyc +0 -0
  24. audioldm/audio/__pycache__/stft.cpython-37.pyc +0 -0
  25. audioldm/audio/__pycache__/stft.cpython-39.pyc +0 -0
  26. audioldm/audio/__pycache__/tools.cpython-310.pyc +0 -0
  27. audioldm/audio/__pycache__/tools.cpython-37.pyc +0 -0
  28. audioldm/audio/__pycache__/tools.cpython-39.pyc +0 -0
  29. audioldm/audio/__pycache__/torch_tools.cpython-39.pyc +0 -0
  30. audioldm/audio/audio_processing.py +100 -0
  31. audioldm/audio/stft.py +186 -0
  32. audioldm/audio/tools.py +85 -0
  33. audioldm/clap/__init__.py +0 -0
  34. audioldm/clap/__pycache__/__init__.cpython-39.pyc +0 -0
  35. audioldm/clap/__pycache__/encoders.cpython-39.pyc +0 -0
  36. audioldm/clap/encoders.py +170 -0
  37. audioldm/clap/open_clip/__init__.py +25 -0
  38. audioldm/clap/open_clip/__pycache__/__init__.cpython-39.pyc +0 -0
  39. audioldm/clap/open_clip/__pycache__/factory.cpython-39.pyc +0 -0
  40. audioldm/clap/open_clip/__pycache__/feature_fusion.cpython-39.pyc +0 -0
  41. audioldm/clap/open_clip/__pycache__/htsat.cpython-39.pyc +0 -0
  42. audioldm/clap/open_clip/__pycache__/loss.cpython-39.pyc +0 -0
  43. audioldm/clap/open_clip/__pycache__/model.cpython-39.pyc +0 -0
  44. audioldm/clap/open_clip/__pycache__/openai.cpython-39.pyc +0 -0
  45. audioldm/clap/open_clip/__pycache__/pann_model.cpython-39.pyc +0 -0
  46. audioldm/clap/open_clip/__pycache__/pretrained.cpython-39.pyc +0 -0
  47. audioldm/clap/open_clip/__pycache__/timm_model.cpython-39.pyc +0 -0
  48. audioldm/clap/open_clip/__pycache__/tokenizer.cpython-39.pyc +0 -0
  49. audioldm/clap/open_clip/__pycache__/transform.cpython-39.pyc +0 -0
  50. audioldm/clap/open_clip/__pycache__/utils.cpython-39.pyc +0 -0
audioldm/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .ldm import LatentDiffusion
+ from .utils import seed_everything, save_wave, get_time, get_duration
+ from .pipeline import *
+
+
+
+
+
audioldm/__main__.py ADDED
@@ -0,0 +1,183 @@
+ #!/usr/bin/python3
+ import os
+ from audioldm import text_to_audio, style_transfer, build_model, save_wave, get_time, round_up_duration, get_duration
+ import argparse
+
+ CACHE_DIR = os.getenv(
+     "AUDIOLDM_CACHE_DIR",
+     os.path.join(os.path.expanduser("~"), ".cache/audioldm"))
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+     "--mode",
+     type=str,
+     required=False,
+     default="generation",
+     help="generation: text-to-audio generation; transfer: style transfer",
+     choices=["generation", "transfer"]
+ )
+
+ parser.add_argument(
+     "-t",
+     "--text",
+     type=str,
+     required=False,
+     default="",
+     help="Text prompt to the model for audio generation",
+ )
+
+ parser.add_argument(
+     "-f",
+     "--file_path",
+     type=str,
+     required=False,
+     default=None,
+     help="(--mode transfer): original audio file for style transfer; or (--mode generation): guidance audio file for generating similar audio",
+ )
+
+ parser.add_argument(
+     "--transfer_strength",
+     type=float,
+     required=False,
+     default=0.5,
+     help="A value between 0 and 1. 0 means the original audio without transfer, 1 means completely transferred to the audio indicated by the text",
+ )
+
+ parser.add_argument(
+     "-s",
+     "--save_path",
+     type=str,
+     required=False,
+     help="The path to save model output",
+     default="./output",
+ )
+
+ parser.add_argument(
+     "--model_name",
+     type=str,
+     required=False,
+     help="The checkpoint you are going to use",
+     default="audioldm-s-full",
+     choices=["audioldm-s-full", "audioldm-l-full", "audioldm-s-full-v2"]
+ )
+
+ parser.add_argument(
+     "-ckpt",
+     "--ckpt_path",
+     type=str,
+     required=False,
+     help="The path to the pretrained .ckpt model",
+     default=None,
+ )
+
+ parser.add_argument(
+     "-b",
+     "--batchsize",
+     type=int,
+     required=False,
+     default=1,
+     help="How many samples to generate at the same time",
+ )
+
+ parser.add_argument(
+     "--ddim_steps",
+     type=int,
+     required=False,
+     default=200,
+     help="The number of sampling steps for DDIM",
+ )
+
+ parser.add_argument(
+     "-gs",
+     "--guidance_scale",
+     type=float,
+     required=False,
+     default=2.5,
+     help="Guidance scale (large => better quality and relevance to text; small => better diversity)",
+ )
+
+ parser.add_argument(
+     "-dur",
+     "--duration",
+     type=float,
+     required=False,
+     default=10.0,
+     help="The duration of the samples",
+ )
+
+ parser.add_argument(
+     "-n",
+     "--n_candidate_gen_per_text",
+     type=int,
+     required=False,
+     default=3,
+     help="Automatic quality control. This number controls the number of candidates (e.g., generate three audios and choose the best to show you). A larger value usually leads to better quality with heavier computation",
+ )
+
+ parser.add_argument(
+     "--seed",
+     type=int,
+     required=False,
+     default=42,
+     help="Changing this value (any integer) will lead to a different generation result.",
+ )
+
+ args = parser.parse_args()
+
+ if(args.ckpt_path is not None):
+     print("Warning: ckpt_path has no effect after version 0.0.20.")
+
+ assert args.duration % 2.5 == 0, "Duration must be a multiple of 2.5"
+
+ mode = args.mode
+ if(mode == "generation" and args.file_path is not None):
+     mode = "generation_audio_to_audio"
+     if(len(args.text) > 0):
+         print("Warning: You have specified the --file_path. --text will be ignored")
+         args.text = ""
+
+ save_path = os.path.join(args.save_path, mode)
+
+ if(args.file_path is not None):
+     save_path = os.path.join(save_path, os.path.basename(args.file_path.split(".")[0]))
+
+ text = args.text
+ random_seed = args.seed
+ duration = args.duration
+ guidance_scale = args.guidance_scale
+ n_candidate_gen_per_text = args.n_candidate_gen_per_text
+
+ os.makedirs(save_path, exist_ok=True)
+ audioldm = build_model(model_name=args.model_name)
+
+ if(args.mode == "generation"):
+     waveform = text_to_audio(
+         audioldm,
+         text,
+         args.file_path,
+         random_seed,
+         duration=duration,
+         guidance_scale=guidance_scale,
+         ddim_steps=args.ddim_steps,
+         n_candidate_gen_per_text=n_candidate_gen_per_text,
+         batchsize=args.batchsize,
+     )
+
+ elif(args.mode == "transfer"):
+     assert args.file_path is not None
+     assert os.path.exists(args.file_path), "The original audio file '%s' for style transfer does not exist." % args.file_path
+     waveform = style_transfer(
+         audioldm,
+         text,
+         args.file_path,
+         args.transfer_strength,
+         random_seed,
+         duration=duration,
+         guidance_scale=guidance_scale,
+         ddim_steps=args.ddim_steps,
+         batchsize=args.batchsize,
+     )
+     waveform = waveform[:,None,:]
+
+ save_wave(waveform, save_path, name="%s_%s" % (get_time(), text))
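
Note: the script above can also be driven programmatically. A minimal sketch, assuming the audioldm package is installed and the default "audioldm-s-full" checkpoint can be fetched by build_model; it reuses only the calls visible in the diff, and the prompt string and output directory are placeholder values:

# Programmatic use of the same pipeline calls made by __main__.py.
import os
from audioldm import build_model, text_to_audio, save_wave, get_time

save_path = "./output/generation"  # placeholder output directory
os.makedirs(save_path, exist_ok=True)

audioldm = build_model(model_name="audioldm-s-full")  # default checkpoint used by the CLI
waveform = text_to_audio(
    audioldm,
    "a dog barking in the distance",  # placeholder text prompt
    None,                             # no guidance audio file
    42,                               # random seed
    duration=10.0,
    guidance_scale=2.5,
    ddim_steps=200,
    n_candidate_gen_per_text=3,
    batchsize=1,
)
save_wave(waveform, save_path, name="%s_%s" % (get_time(), "a dog barking in the distance"))
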
audioldm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (376 Bytes). View file
 
audioldm/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (290 Bytes). View file
 
audioldm/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (322 Bytes). View file
 
audioldm/__pycache__/ldm.cpython-310.pyc ADDED
Binary file (16.1 kB). View file
 
audioldm/__pycache__/ldm.cpython-37.pyc ADDED
Binary file (16 kB). View file
 
audioldm/__pycache__/ldm.cpython-39.pyc ADDED
Binary file (16 kB). View file
 
audioldm/__pycache__/pipeline.cpython-310.pyc ADDED
Binary file (6.69 kB). View file
 
audioldm/__pycache__/pipeline.cpython-37.pyc ADDED
Binary file (6.41 kB). View file
 
audioldm/__pycache__/pipeline.cpython-39.pyc ADDED
Binary file (6.54 kB). View file
 
audioldm/__pycache__/utils.cpython-310.pyc ADDED
Binary file (8.07 kB). View file
 
audioldm/__pycache__/utils.cpython-37.pyc ADDED
Binary file (7.65 kB). View file
 
audioldm/__pycache__/utils.cpython-39.pyc ADDED
Binary file (7.35 kB). View file
 
audioldm/audio/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .tools import wav_to_fbank, read_wav_file
+ from .stft import TacotronSTFT
audioldm/audio/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (314 Bytes). View file
 
audioldm/audio/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (228 Bytes). View file
 
audioldm/audio/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (260 Bytes). View file
 
audioldm/audio/__pycache__/audio_processing.cpython-310.pyc ADDED
Binary file (2.84 kB). View file
 
audioldm/audio/__pycache__/audio_processing.cpython-37.pyc ADDED
Binary file (2.74 kB). View file
 
audioldm/audio/__pycache__/audio_processing.cpython-39.pyc ADDED
Binary file (2.78 kB). View file
 
audioldm/audio/__pycache__/mix.cpython-39.pyc ADDED
Binary file (1.7 kB). View file
 
audioldm/audio/__pycache__/stft.cpython-310.pyc ADDED
Binary file (5.08 kB). View file
 
audioldm/audio/__pycache__/stft.cpython-37.pyc ADDED
Binary file (4.97 kB). View file
 
audioldm/audio/__pycache__/stft.cpython-39.pyc ADDED
Binary file (4.99 kB). View file
 
audioldm/audio/__pycache__/tools.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
audioldm/audio/__pycache__/tools.cpython-37.pyc ADDED
Binary file (2.16 kB). View file
 
audioldm/audio/__pycache__/tools.cpython-39.pyc ADDED
Binary file (2.19 kB). View file
 
audioldm/audio/__pycache__/torch_tools.cpython-39.pyc ADDED
Binary file (3.79 kB). View file
 
audioldm/audio/audio_processing.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+ import numpy as np
+ import librosa.util as librosa_util
+ from scipy.signal import get_window
+
+
+ def window_sumsquare(
+     window,
+     n_frames,
+     hop_length,
+     win_length,
+     n_fft,
+     dtype=np.float32,
+     norm=None,
+ ):
+     """
+     # from librosa 0.6
+     Compute the sum-square envelope of a window function at a given hop length.
+
+     This is used to estimate modulation effects induced by windowing
+     observations in short-time Fourier transforms.
+
+     Parameters
+     ----------
+     window : string, tuple, number, callable, or list-like
+         Window specification, as in `get_window`
+
+     n_frames : int > 0
+         The number of analysis frames
+
+     hop_length : int > 0
+         The number of samples to advance between frames
+
+     win_length : [optional]
+         The length of the window function. By default, this matches `n_fft`.
+
+     n_fft : int > 0
+         The length of each analysis frame.
+
+     dtype : np.dtype
+         The data type of the output
+
+     Returns
+     -------
+     wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+         The sum-squared envelope of the window function
+     """
+     if win_length is None:
+         win_length = n_fft
+
+     n = n_fft + hop_length * (n_frames - 1)
+     x = np.zeros(n, dtype=dtype)
+
+     # Compute the squared window at the desired length
+     win_sq = get_window(window, win_length, fftbins=True)
+     win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
+     win_sq = librosa_util.pad_center(win_sq, size=n_fft)
+
+     # Fill the envelope
+     for i in range(n_frames):
+         sample = i * hop_length
+         x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
+     return x
+
+
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
+     """
+     PARAMS
+     ------
+     magnitudes: spectrogram magnitudes
+     stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
+     """
+
+     angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
+     angles = angles.astype(np.float32)
+     angles = torch.autograd.Variable(torch.from_numpy(angles))
+     signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+
+     for i in range(n_iters):
+         _, angles = stft_fn.transform(signal)
+         signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
+     return signal
+
+
+ def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
+     """
+     PARAMS
+     ------
+     C: compression factor
+     """
+     return normalize_fun(torch.clamp(x, min=clip_val) * C)
+
+
+ def dynamic_range_decompression(x, C=1):
+     """
+     PARAMS
+     ------
+     C: compression factor used to compress
+     """
+     return torch.exp(x) / C
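
A small sketch exercising the helpers added above; the tensor shapes and the hop/window settings are illustrative values only, not a documented configuration:

# Compression followed by decompression should approximately recover the input
# (up to the clip_val floor inside dynamic_range_compression).
import torch
from audioldm.audio.audio_processing import (
    dynamic_range_compression,
    dynamic_range_decompression,
    window_sumsquare,
)

mel = torch.rand(4, 64, 100) + 1e-3              # fake magnitudes, illustrative shape
compressed = dynamic_range_compression(mel)       # log-compressed copy
recovered = dynamic_range_decompression(compressed)
print(torch.allclose(mel, recovered, atol=1e-4))  # True

# Sum-square envelope of a Hann window, as used by STFT.inverse in stft.py
env = window_sumsquare("hann", n_frames=100, hop_length=160, win_length=1024, n_fft=1024)
print(env.shape)  # (1024 + 160 * 99,)
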
audioldm/audio/stft.py ADDED
@@ -0,0 +1,186 @@
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from scipy.signal import get_window
+ from librosa.util import pad_center, tiny
+ from librosa.filters import mel as librosa_mel_fn
+
+ from audioldm.audio.audio_processing import (
+     dynamic_range_compression,
+     dynamic_range_decompression,
+     window_sumsquare,
+ )
+
+
+ class STFT(torch.nn.Module):
+     """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+
+     def __init__(self, filter_length, hop_length, win_length, window="hann"):
+         super(STFT, self).__init__()
+         self.filter_length = filter_length
+         self.hop_length = hop_length
+         self.win_length = win_length
+         self.window = window
+         self.forward_transform = None
+         scale = self.filter_length / self.hop_length
+         fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+         cutoff = int((self.filter_length / 2 + 1))
+         fourier_basis = np.vstack(
+             [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
+         )
+
+         forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+         inverse_basis = torch.FloatTensor(
+             np.linalg.pinv(scale * fourier_basis).T[:, None, :]
+         )
+
+         if window is not None:
+             assert filter_length >= win_length
+             # get window and zero center pad it to filter_length
+             fft_window = get_window(window, win_length, fftbins=True)
+             fft_window = pad_center(fft_window, size=filter_length)
+             fft_window = torch.from_numpy(fft_window).float()
+
+             # window the bases
+             forward_basis *= fft_window
+             inverse_basis *= fft_window
+
+         self.register_buffer("forward_basis", forward_basis.float())
+         self.register_buffer("inverse_basis", inverse_basis.float())
+
+     def transform(self, input_data):
+         device = self.forward_basis.device
+         input_data = input_data.to(device)
+
+         num_batches = input_data.size(0)
+         num_samples = input_data.size(1)
+
+         self.num_samples = num_samples
+
+         # similar to librosa, reflect-pad the input
+         input_data = input_data.view(num_batches, 1, num_samples)
+         input_data = F.pad(
+             input_data.unsqueeze(1),
+             (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+             mode="reflect",
+         )
+         input_data = input_data.squeeze(1)
+
+         forward_transform = F.conv1d(
+             input_data,
+             torch.autograd.Variable(self.forward_basis, requires_grad=False),
+             stride=self.hop_length,
+             padding=0,
+         )  # .cpu()
+
+         cutoff = int((self.filter_length / 2) + 1)
+         real_part = forward_transform[:, :cutoff, :]
+         imag_part = forward_transform[:, cutoff:, :]
+
+         magnitude = torch.sqrt(real_part**2 + imag_part**2)
+         phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
+
+         return magnitude, phase
+
+     def inverse(self, magnitude, phase):
+         device = self.forward_basis.device
+         magnitude, phase = magnitude.to(device), phase.to(device)
+
+         recombine_magnitude_phase = torch.cat(
+             [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+         )
+
+         inverse_transform = F.conv_transpose1d(
+             recombine_magnitude_phase,
+             torch.autograd.Variable(self.inverse_basis, requires_grad=False),
+             stride=self.hop_length,
+             padding=0,
+         )
+
+         if self.window is not None:
+             window_sum = window_sumsquare(
+                 self.window,
+                 magnitude.size(-1),
+                 hop_length=self.hop_length,
+                 win_length=self.win_length,
+                 n_fft=self.filter_length,
+                 dtype=np.float32,
+             )
+             # remove modulation effects
+             approx_nonzero_indices = torch.from_numpy(
+                 np.where(window_sum > tiny(window_sum))[0]
+             )
+             window_sum = torch.autograd.Variable(
+                 torch.from_numpy(window_sum), requires_grad=False
+             )
+             window_sum = window_sum
+             inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
+                 approx_nonzero_indices
+             ]
+
+             # scale by hop ratio
+             inverse_transform *= float(self.filter_length) / self.hop_length
+
+         inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
+         inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
+
+         return inverse_transform
+
+     def forward(self, input_data):
+         self.magnitude, self.phase = self.transform(input_data)
+         reconstruction = self.inverse(self.magnitude, self.phase)
+         return reconstruction
+
+
+ class TacotronSTFT(torch.nn.Module):
+     def __init__(
+         self,
+         filter_length,
+         hop_length,
+         win_length,
+         n_mel_channels,
+         sampling_rate,
+         mel_fmin,
+         mel_fmax,
+     ):
+         super(TacotronSTFT, self).__init__()
+         self.n_mel_channels = n_mel_channels
+         self.sampling_rate = sampling_rate
+         self.stft_fn = STFT(filter_length, hop_length, win_length)
+         mel_basis = librosa_mel_fn(
+             sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
+         )
+         mel_basis = torch.from_numpy(mel_basis).float()
+         self.register_buffer("mel_basis", mel_basis)
+
+     def spectral_normalize(self, magnitudes, normalize_fun):
+         output = dynamic_range_compression(magnitudes, normalize_fun)
+         return output
+
+     def spectral_de_normalize(self, magnitudes):
+         output = dynamic_range_decompression(magnitudes)
+         return output
+
+     def mel_spectrogram(self, y, normalize_fun=torch.log):
+         """Computes mel-spectrograms from a batch of waves
+         PARAMS
+         ------
+         y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
+
+         RETURNS
+         -------
+         mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
+         """
+         assert torch.min(y.data) >= -1, torch.min(y.data)
+         assert torch.max(y.data) <= 1, torch.max(y.data)
+
+         magnitudes, phases = self.stft_fn.transform(y)
+         magnitudes = magnitudes.data
+         mel_output = torch.matmul(self.mel_basis, magnitudes)
+         mel_output = self.spectral_normalize(mel_output, normalize_fun)
+         energy = torch.norm(magnitudes, dim=1)
+
+         log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)
+
+         return mel_output, log_magnitudes, energy
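
A minimal usage sketch for TacotronSTFT. The filter/mel settings below are assumptions chosen to match the 16 kHz sampling rate and 160-sample hop used elsewhere in this upload, not a documented configuration:

import torch
from audioldm.audio.stft import TacotronSTFT

# Illustrative settings: 1024-point FFT, 160-sample hop (matching tools.py),
# 64 mel bands at 16 kHz. Treat these values as assumptions.
fn_STFT = TacotronSTFT(
    filter_length=1024,
    hop_length=160,
    win_length=1024,
    n_mel_channels=64,
    sampling_rate=16000,
    mel_fmin=0,
    mel_fmax=8000,
)

wav = torch.rand(1, 16000 * 2) * 2 - 1  # 2 s of fake audio in [-1, 1]
mel, log_mag, energy = fn_STFT.mel_spectrogram(wav)
print(mel.shape)  # (1, 64, T), where T depends on the hop length
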
audioldm/audio/tools.py ADDED
@@ -0,0 +1,85 @@
+ import torch
+ import numpy as np
+ import torchaudio
+
+
+ def get_mel_from_wav(audio, _stft):
+     audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
+     audio = torch.autograd.Variable(audio, requires_grad=False)
+     melspec, log_magnitudes_stft, energy = _stft.mel_spectrogram(audio)
+     melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
+     log_magnitudes_stft = (
+         torch.squeeze(log_magnitudes_stft, 0).numpy().astype(np.float32)
+     )
+     energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
+     return melspec, log_magnitudes_stft, energy
+
+
+ def _pad_spec(fbank, target_length=1024):
+     n_frames = fbank.shape[0]
+     p = target_length - n_frames
+     # cut and pad
+     if p > 0:
+         m = torch.nn.ZeroPad2d((0, 0, 0, p))
+         fbank = m(fbank)
+     elif p < 0:
+         fbank = fbank[0:target_length, :]
+
+     if fbank.size(-1) % 2 != 0:
+         fbank = fbank[..., :-1]
+
+     return fbank
+
+
+ def pad_wav(waveform, segment_length):
+     waveform_length = waveform.shape[-1]
+     assert waveform_length > 100, "Waveform is too short, %s" % waveform_length
+     if segment_length is None or waveform_length == segment_length:
+         return waveform
+     elif waveform_length > segment_length:
+         return waveform[:segment_length]
+     elif waveform_length < segment_length:
+         temp_wav = np.zeros((1, segment_length))
+         temp_wav[:, :waveform_length] = waveform
+     return temp_wav
+
+ def normalize_wav(waveform):
+     waveform = waveform - np.mean(waveform)
+     waveform = waveform / (np.max(np.abs(waveform)) + 1e-8)
+     return waveform * 0.5
+
+
+ def read_wav_file(filename, segment_length):
+     # waveform, sr = librosa.load(filename, sr=None, mono=True)  # 4 times slower
+     waveform, sr = torchaudio.load(filename)  # Faster!!!
+     waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
+     waveform = waveform.numpy()[0, ...]
+     waveform = normalize_wav(waveform)
+     waveform = waveform[None, ...]
+     waveform = pad_wav(waveform, segment_length)
+
+     waveform = waveform / np.max(np.abs(waveform))
+     waveform = 0.5 * waveform
+
+     return waveform
+
+
+ def wav_to_fbank(filename, target_length=1024, fn_STFT=None):
+     assert fn_STFT is not None
+
+     # mixup
+     waveform = read_wav_file(filename, target_length * 160)  # hop size is 160
+
+     waveform = waveform[0, ...]
+     waveform = torch.FloatTensor(waveform)
+
+     fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
+
+     fbank = torch.FloatTensor(fbank.T)
+     log_magnitudes_stft = torch.FloatTensor(log_magnitudes_stft.T)
+
+     fbank, log_magnitudes_stft = _pad_spec(fbank, target_length), _pad_spec(
+         log_magnitudes_stft, target_length
+     )
+
+     return fbank, log_magnitudes_stft, waveform
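
A usage sketch tying tools.py to the TacotronSTFT sketch above. The audio path and the STFT settings are placeholders/assumptions; only the function names come from the diff:

from audioldm.audio import TacotronSTFT, wav_to_fbank

# Same illustrative STFT settings as in the stft.py sketch above (assumed, not documented).
fn_STFT = TacotronSTFT(1024, 160, 1024, 64, 16000, 0, 8000)

# target_length=1024 frames * 160-sample hop = 163840 samples (~10.24 s at 16 kHz)
fbank, log_mag, wav = wav_to_fbank("example.wav", target_length=1024, fn_STFT=fn_STFT)  # placeholder path
print(fbank.shape, log_mag.shape, wav.shape)  # (1024, 64), (1024, 512), (163840,) with these settings
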
audioldm/clap/__init__.py ADDED
File without changes
audioldm/clap/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (152 Bytes). View file
 
audioldm/clap/__pycache__/encoders.cpython-39.pyc ADDED
Binary file (5.1 kB). View file
 
audioldm/clap/encoders.py ADDED
@@ -0,0 +1,170 @@
+ import torch
+ import torch.nn as nn
+ from audioldm.clap.open_clip import create_model
+ from audioldm.clap.training.data import get_audio_features
+ import torchaudio
+ from transformers import RobertaTokenizer
+ import torch.nn.functional as F
+
+
+ class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
+     def __init__(
+         self,
+         pretrained_path="",
+         key="class",
+         sampling_rate=16000,
+         embed_mode="audio",
+         amodel="HTSAT-tiny",
+         unconditional_prob=0.1,
+         random_mute=False,
+         max_random_mute_portion=0.5,
+         training_mode=True,
+     ):
+         super().__init__()
+
+         self.key = key
+         self.device = "cpu"
+         self.precision = "fp32"
+         self.amodel = amodel  # or 'PANN-14'
+         self.tmodel = "roberta"  # the best text encoder in our training
+         self.enable_fusion = False  # False if you do not want to use the fusion model
+         self.fusion_type = "aff_2d"
+         self.pretrained = pretrained_path
+         self.embed_mode = embed_mode
+         self.embed_mode_orig = embed_mode
+         self.sampling_rate = sampling_rate
+         self.unconditional_prob = unconditional_prob
+         self.random_mute = random_mute
+         self.tokenize = RobertaTokenizer.from_pretrained("roberta-base")
+         self.max_random_mute_portion = max_random_mute_portion
+         self.training_mode = training_mode
+         self.model, self.model_cfg = create_model(
+             self.amodel,
+             self.tmodel,
+             self.pretrained,
+             precision=self.precision,
+             device=self.device,
+             enable_fusion=self.enable_fusion,
+             fusion_type=self.fusion_type,
+         )
+         for p in self.model.parameters():
+             p.requires_grad = False
+
+         self.model.eval()
+
+     def get_unconditional_condition(self, batchsize):
+         self.unconditional_token = self.model.get_text_embedding(
+             self.tokenizer(["", ""])
+         )[0:1]
+         return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)
+
+     def batch_to_list(self, batch):
+         ret = []
+         for i in range(batch.size(0)):
+             ret.append(batch[i])
+         return ret
+
+     def make_decision(self, probability):
+         if float(torch.rand(1)) < probability:
+             return True
+         else:
+             return False
+
+     def random_uniform(self, start, end):
+         val = torch.rand(1).item()
+         return start + (end - start) * val
+
+     def _random_mute(self, waveform):
+         # waveform: [bs, t-steps]
+         t_steps = waveform.size(-1)
+         for i in range(waveform.size(0)):
+             mute_size = int(
+                 self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
+             )
+             mute_start = int(self.random_uniform(0, t_steps - mute_size))
+             waveform[i, mute_start : mute_start + mute_size] = 0
+         return waveform
+
+     def cos_similarity(self, waveform, text):
+         # waveform: [bs, t_steps]
+         with torch.no_grad():
+             self.embed_mode = "audio"
+             audio_emb = self(waveform.cuda())
+             self.embed_mode = "text"
+             text_emb = self(text)
+             similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)
+             return similarity.squeeze()
+
+     def forward(self, batch, key=None):
+         # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
+         # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
+         if self.model.training == True and not self.training_mode:
+             print(
+                 "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
+             )
+             self.model, self.model_cfg = create_model(
+                 self.amodel,
+                 self.tmodel,
+                 self.pretrained,
+                 precision=self.precision,
+                 device="cuda",
+                 enable_fusion=self.enable_fusion,
+                 fusion_type=self.fusion_type,
+             )
+             for p in self.model.parameters():
+                 p.requires_grad = False
+             self.model.eval()
+
+         # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+         if self.embed_mode == "audio":
+             with torch.no_grad():
+                 audio_dict_list = []
+                 assert (
+                     self.sampling_rate == 16000
+                 ), "We only support 16000 sampling rate"
+                 if self.random_mute:
+                     batch = self._random_mute(batch)
+                 # batch: [bs, 1, t-samples]
+                 batch = torchaudio.functional.resample(
+                     batch, orig_freq=self.sampling_rate, new_freq=48000
+                 )
+                 for waveform in self.batch_to_list(batch):
+                     audio_dict = {}
+                     audio_dict = get_audio_features(
+                         audio_dict,
+                         waveform,
+                         480000,
+                         data_truncating="fusion",
+                         data_filling="repeatpad",
+                         audio_cfg=self.model_cfg["audio_cfg"],
+                     )
+                     audio_dict_list.append(audio_dict)
+                 # [bs, 512]
+                 embed = self.model.get_audio_embedding(audio_dict_list)
+         elif self.embed_mode == "text":
+             with torch.no_grad():
+                 # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
+                 text_data = self.tokenizer(batch)
+                 embed = self.model.get_text_embedding(text_data)
+
+         embed = embed.unsqueeze(1)
+         self.unconditional_token = self.model.get_text_embedding(
+             self.tokenizer(["", ""])
+         )[0:1]
+
+         for i in range(embed.size(0)):
+             if self.make_decision(self.unconditional_prob):
+                 embed[i] = self.unconditional_token
+
+         # [bs, 1, 512]
+         return embed.detach()
+
+     def tokenizer(self, text):
+         result = self.tokenize(
+             text,
+             padding="max_length",
+             truncation=True,
+             max_length=512,
+             return_tensors="pt",
+         )
+         return {k: v.squeeze(0) for k, v in result.items()}
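
A hedged sketch of using the CLAP wrapper above as a text conditioner. It assumes a valid CLAP checkpoint path ("ckpt/clap.ckpt" is a placeholder) and downloads the roberta-base tokenizer, so treat it as illustrative only:

import torch
from audioldm.clap.encoders import CLAPAudioEmbeddingClassifierFreev2

# unconditional_prob=0 keeps every prompt conditional, so each output embedding
# corresponds to its input text; "ckpt/clap.ckpt" is a placeholder checkpoint path.
clap = CLAPAudioEmbeddingClassifierFreev2(
    pretrained_path="ckpt/clap.ckpt",
    embed_mode="text",
    unconditional_prob=0.0,
    training_mode=False,
)

with torch.no_grad():
    embed = clap(["a dog barking in the distance", "rain on a tin roof"])
print(embed.shape)  # expected (2, 1, 512) given the CLAP projection size
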
audioldm/clap/open_clip/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from .factory import (
+     list_models,
+     create_model,
+     create_model_and_transforms,
+     add_model_config,
+ )
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
+ from .model import (
+     CLAP,
+     CLAPTextCfg,
+     CLAPVisionCfg,
+     CLAPAudioCfp,
+     convert_weights_to_fp16,
+     trace_model,
+ )
+ from .openai import load_openai_model, list_openai_models
+ from .pretrained import (
+     list_pretrained,
+     list_pretrained_tag_models,
+     list_pretrained_model_tags,
+     get_pretrained_url,
+     download_pretrained,
+ )
+ from .tokenizer import SimpleTokenizer, tokenize
+ from .transform import image_transform
audioldm/clap/open_clip/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (958 Bytes). View file
 
audioldm/clap/open_clip/__pycache__/factory.cpython-39.pyc ADDED
Binary file (6.67 kB). View file
 
audioldm/clap/open_clip/__pycache__/feature_fusion.cpython-39.pyc ADDED
Binary file (4.17 kB). View file
 
audioldm/clap/open_clip/__pycache__/htsat.cpython-39.pyc ADDED
Binary file (30.8 kB). View file
 
audioldm/clap/open_clip/__pycache__/loss.cpython-39.pyc ADDED
Binary file (8.06 kB). View file
 
audioldm/clap/open_clip/__pycache__/model.cpython-39.pyc ADDED
Binary file (23.8 kB). View file
 
audioldm/clap/open_clip/__pycache__/openai.cpython-39.pyc ADDED
Binary file (4.55 kB). View file
 
audioldm/clap/open_clip/__pycache__/pann_model.cpython-39.pyc ADDED
Binary file (13.3 kB). View file
 
audioldm/clap/open_clip/__pycache__/pretrained.cpython-39.pyc ADDED
Binary file (5.09 kB). View file
 
audioldm/clap/open_clip/__pycache__/timm_model.cpython-39.pyc ADDED
Binary file (3.4 kB). View file
 
audioldm/clap/open_clip/__pycache__/tokenizer.cpython-39.pyc ADDED
Binary file (7.42 kB). View file
 
audioldm/clap/open_clip/__pycache__/transform.cpython-39.pyc ADDED
Binary file (974 Bytes). View file
 
audioldm/clap/open_clip/__pycache__/utils.cpython-39.pyc ADDED
Binary file (9.83 kB). View file