zeeshanrafiqrana committed on
Commit
3280d50
1 Parent(s): bb702e0

push video-vbackend

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. CHANGELOG.md +28 -0
  2. CODE_OF_CONDUCT.md +80 -0
  3. CONTRIBUTING.md +35 -0
  4. LICENSE +21 -0
  5. LICENSE_weights +399 -0
  6. MANIFEST.in +9 -0
  7. Makefile +40 -0
  8. README.md +7 -8
  9. app.py +422 -0
  10. assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 +0 -0
  11. assets/bach.mp3 +0 -0
  12. assets/bolero_ravel.mp3 +0 -0
  13. assets/sirens_and_a_humming_engine_approach_and_pass.mp3 +0 -0
  14. audiocraft/__init__.py +26 -0
  15. audiocraft/adversarial/__init__.py +22 -0
  16. audiocraft/adversarial/discriminators/__init__.py +10 -0
  17. audiocraft/adversarial/discriminators/base.py +34 -0
  18. audiocraft/adversarial/discriminators/mpd.py +106 -0
  19. audiocraft/adversarial/discriminators/msd.py +126 -0
  20. audiocraft/adversarial/discriminators/msstftd.py +134 -0
  21. audiocraft/adversarial/losses.py +228 -0
  22. audiocraft/data/__init__.py +10 -0
  23. audiocraft/data/audio.py +216 -0
  24. audiocraft/data/audio_dataset.py +587 -0
  25. audiocraft/data/audio_utils.py +176 -0
  26. audiocraft/data/info_audio_dataset.py +110 -0
  27. audiocraft/data/music_dataset.py +270 -0
  28. audiocraft/data/sound_dataset.py +330 -0
  29. audiocraft/data/zip.py +76 -0
  30. audiocraft/environment.py +176 -0
  31. audiocraft/grids/__init__.py +6 -0
  32. audiocraft/grids/_base_explorers.py +80 -0
  33. audiocraft/grids/audiogen/__init__.py +6 -0
  34. audiocraft/grids/audiogen/audiogen_base_16khz.py +23 -0
  35. audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py +68 -0
  36. audiocraft/grids/compression/__init__.py +6 -0
  37. audiocraft/grids/compression/_explorers.py +55 -0
  38. audiocraft/grids/compression/debug.py +31 -0
  39. audiocraft/grids/compression/encodec_audiogen_16khz.py +29 -0
  40. audiocraft/grids/compression/encodec_base_24khz.py +28 -0
  41. audiocraft/grids/compression/encodec_musicgen_32khz.py +34 -0
  42. audiocraft/grids/diffusion/4_bands_base_32khz.py +27 -0
  43. audiocraft/grids/diffusion/__init__.py +6 -0
  44. audiocraft/grids/diffusion/_explorers.py +66 -0
  45. audiocraft/grids/musicgen/__init__.py +6 -0
  46. audiocraft/grids/musicgen/_explorers.py +93 -0
  47. audiocraft/grids/musicgen/musicgen_base_32khz.py +43 -0
  48. audiocraft/grids/musicgen/musicgen_base_cached_32khz.py +67 -0
  49. audiocraft/grids/musicgen/musicgen_clapemb_32khz.py +32 -0
  50. audiocraft/grids/musicgen/musicgen_melody_32khz.py +65 -0
CHANGELOG.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
+
7
+ ## [1.0.0] - 2023-08-02
8
+
9
+ Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
10
+ Added pretrained model for AudioGen and MultiBandDiffusion.
11
+
12
+ ## [0.0.2] - 2023-08-01
13
+
14
+ Improved demo, fixed top p (thanks @jnordberg).
15
+
16
+ Compressor tanh on output to avoid clipping with some styles (especially piano).
17
+ Now repeating the conditioning periodically if it is too short.
18
+
19
+ More options when launching Gradio app locally (thanks @ashleykleynhans).
20
+
21
+ Testing out PyTorch 2.0 memory efficient attention.
22
+
23
+ Added extended generation (infinite length) by slowly moving the windows.
24
+ Note that other implementations exist: https://github.com/camenduru/MusicGen-colab.
25
+
26
+ ## [0.0.1] - 2023-06-09
27
+
28
+ Initial release, with model evaluation only.
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to AudioCraft
2
+
3
+ We want to make contributing to this project as easy and transparent as
4
+ possible.
5
+
6
+ ## Pull Requests
7
+
8
+ AudioCraft is the implementation of a research paper.
9
+ Therefore, we do not plan on accepting many pull requests for new features.
10
+ We certainly welcome them for bug fixes.
11
+
12
+ 1. Fork the repo and create your branch from `main`.
13
+ 2. If you've added code that should be tested, add tests.
14
+ 3. If you've changed APIs, update the documentation.
15
+ 4. Ensure the test suite passes.
16
+ 5. Make sure your code lints.
17
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
18
+
19
+ ## Contributor License Agreement ("CLA")
20
+ In order to accept your pull request, we need you to submit a CLA. You only need
21
+ to do this once to work on any of Meta's open source projects.
22
+
23
+ Complete your CLA here: <https://code.facebook.com/cla>
24
+
25
+ ## Issues
26
+ We use GitHub issues to track public bugs. Please ensure your description is
27
+ clear and has sufficient instructions to be able to reproduce the issue.
28
+
29
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
30
+ disclosure of security bugs. In those cases, please go through the process
31
+ outlined on that page and do not file a public issue.
32
+
33
+ ## License
34
+ By contributing to AudioCraft, you agree that your contributions will be licensed
35
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
LICENSE_weights ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
MANIFEST.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ include Makefile
2
+ include LICENSE
3
+ include LICENSE_weights
4
+ include *.md
5
+ include *.ini
6
+ include requirements.txt
7
+ include audiocraft/py.typed
8
+ include assets/*.mp3
9
+ recursive-include conf *.yaml
Makefile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INTEG=AUDIOCRAFT_DORA_DIR="/tmp/magma_$(USER)" python3 -m dora -v run --clear device=cpu dataset.num_workers=0 optim.epochs=1 \
2
+ dataset.train.num_samples=10 dataset.valid.num_samples=10 \
3
+ dataset.evaluate.num_samples=10 dataset.generate.num_samples=2 sample_rate=16000 \
4
+ logging.level=DEBUG
5
+ INTEG_COMPRESSION = $(INTEG) solver=compression/debug rvq.n_q=2 rvq.bins=48 checkpoint.save_last=true # SIG is 5091833e
6
+ INTEG_MUSICGEN = $(INTEG) solver=musicgen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
7
+ transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
8
+ INTEG_AUDIOGEN = $(INTEG) solver=audiogen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
9
+ transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
10
+ INTEG_MBD = $(INTEG) solver=diffusion/debug dset=audio/example \
11
+ checkpoint.save_last=false # Using compression model from 616d7b3c
12
+
13
+ default: linter tests
14
+
15
+ install:
16
+ pip install -U pip
17
+ pip install -U -e '.[dev]'
18
+
19
+ linter:
20
+ flake8 audiocraft && mypy audiocraft
21
+ flake8 tests && mypy tests
22
+
23
+ tests:
24
+ coverage run -m pytest tests
25
+ coverage report
26
+
27
+ tests_integ:
28
+ $(INTEG_COMPRESSION)
29
+ $(INTEG_MBD)
30
+ $(INTEG_MUSICGEN)
31
+ $(INTEG_AUDIOGEN)
32
+
33
+
34
+ api_docs:
35
+ pdoc3 --html -o api_docs -f audiocraft
36
+
37
+ dist:
38
+ python setup.py sdist
39
+
40
# Every target that does not produce a file of its own name is declared phony,
# so a stray file or directory called e.g. `install` or `default` cannot
# make the corresponding rule appear up to date.
.PHONY: default install linter tests tests_integ api_docs dist
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: Musicgen1
3
- emoji: 🏢
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 3.44.4
 
8
  app_file: app.py
 
 
9
  pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ license: openrail
3
+ title: videoshop-backend
 
 
4
  sdk: gradio
5
+ sdk_version: 3.41.2
6
+ emoji: 🚀
7
  app_file: app.py
8
+ colorFrom: green
9
+ colorTo: blue
10
  pinned: false
11
+ ---
 
 
app.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from concurrent.futures import ProcessPoolExecutor
3
+ import os
4
+ from pathlib import Path
5
+ import subprocess as sp
6
+ from tempfile import NamedTemporaryFile
7
+ import time
8
+ import typing as tp
9
+ import warnings
10
+
11
+ import torch
12
+ import gradio as gr
13
+
14
+ from audiocraft.data.audio_utils import convert_audio
15
+ from audiocraft.data.audio import audio_write
16
+ from audiocraft.models import MusicGen, MultiBandDiffusion
17
+
18
+
19
MODEL = None  # Last used model
# Batched mode is only enabled when running inside the "facebook/MusicGen" Space.
IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
print(IS_BATCHED)
MAX_BATCH_SIZE = 12
BATCHED_DURATION = 15
INTERRUPTING = False  # Set by `interrupt()`, polled by the progress callback.
MBD = None  # Lazily loaded MultiBandDiffusion decoder.
# We have to wrap subprocess call to clean a bit the log when using gr.make_waveform
_old_call = sp.call


def _call_nostderr(*args, **kwargs):
    """Drop-in replacement for `subprocess.call` that silences the child process.

    stdout and stderr are both redirected to DEVNULL (overriding any value the
    caller passed) to avoid ffmpeg flooding the logs.

    Returns:
        The return code of the child process, exactly as `subprocess.call`
        would report it. The wrapper used to swallow this value, so every
        caller of `sp.call` silently received `None`.
    """
    kwargs['stderr'] = sp.DEVNULL
    kwargs['stdout'] = sp.DEVNULL
    return _old_call(*args, **kwargs)


sp.call = _call_nostderr
# Preallocating the pool of processes.
pool = ProcessPoolExecutor(4)
pool.__enter__()
41
+
42
+
43
def interrupt():
    """Raise the module-level INTERRUPTING flag.

    The flag is read by the progress callback installed in `predict_full`.
    """
    global INTERRUPTING
    INTERRUPTING = True
46
+
47
+
48
class FileCleaner:
    """Track generated files and delete them once they are older than a lifetime.

    Entries are kept in insertion order as `(time_added, path)` pairs, so the
    oldest entry is always at the front of `files`.
    """

    def __init__(self, file_lifetime: float = 3600):
        # Maximum age, in seconds, before a tracked file is removed.
        self.file_lifetime = file_lifetime
        self.files = []

    def add(self, path: tp.Union[str, Path]):
        """Purge expired files, then start tracking `path`."""
        self._cleanup()
        entry = (time.time(), Path(path))
        self.files.append(entry)

    def _cleanup(self):
        """Delete expired files from the front of the queue.

        Stops at the first entry that has not expired yet; later entries were
        added afterwards and are therefore not examined.
        """
        current = time.time()
        while self.files:
            added_at, target = self.files[0]
            expired = current - added_at > self.file_lifetime
            if not expired:
                break
            if target.exists():
                target.unlink()
            self.files.pop(0)


file_cleaner = FileCleaner()
69
+
70
+
71
def make_waveform(*args, **kwargs):
    """Call `gr.make_waveform` with all Python warnings silenced.

    Arguments are forwarded unchanged. The elapsed wall-clock time is printed,
    and the value returned by `gr.make_waveform` is returned as-is.
    """
    started = time.time()
    with warnings.catch_warnings():
        # Further remove some warnings.
        warnings.simplefilter('ignore')
        video = gr.make_waveform(*args, **kwargs)
        print("Make a video took", time.time() - started)
        return video
79
+
80
+
81
def load_model(version='facebook/musicgen-melody'):
    """Make sure the global MODEL holds the pretrained MusicGen `version`.

    The model is only (re)loaded when nothing is loaded yet or when the
    currently loaded model's `name` differs from `version`.
    """
    global MODEL
    print("Loading model", version)
    already_loaded = MODEL is not None and not (MODEL.name != version)
    if already_loaded:
        return
    MODEL = MusicGen.get_pretrained(version)
86
+
87
+
88
def load_diffusion():
    """Load the MultiBandDiffusion decoder into the global MBD on first use."""
    global MBD
    if MBD is not None:
        return
    print("loading MBD")
    MBD = MultiBandDiffusion.get_mbd_musicgen()
93
+
94
+
95
def _do_predictions(texts, melodies, duration, progress=False, **gen_kwargs):
    """Generate audio for a batch of prompts and render a waveform video for each.

    Args:
        texts: Text descriptions, one per item.
        melodies: One entry per item; each is either None or a pair whose first
            element is the sample rate and whose second element is a numpy array.
        duration: Generation duration, also used to truncate melodies.
        progress: Forwarded to the model's generate call.
        **gen_kwargs: Extra arguments for `MODEL.set_generation_params`.

    Returns:
        A pair `(out_videos, out_wavs)` of file paths. When the global
        USE_DIFFUSION is set, the diffusion-decoded outputs are concatenated
        after the default ones along the first dimension.
    """
    MODEL.set_generation_params(duration=duration, **gen_kwargs)
    melody_summary = [None if m is None else (m[0], m[1].shape) for m in melodies]
    print("new batch", len(texts), texts, melody_summary)
    started = time.time()
    target_sr = 32000
    target_ac = 1

    processed_melodies = []
    for entry in melodies:
        if entry is None:
            processed_melodies.append(None)
            continue
        sr = entry[0]
        wav = torch.from_numpy(entry[1]).to(MODEL.device).float().t()
        if wav.dim() == 1:
            # Add a leading dimension so the tensor is always at least 2D.
            wav = wav[None]
        # Keep at most `duration` seconds of the conditioning melody.
        wav = wav[..., :int(sr * duration)]
        processed_melodies.append(convert_audio(wav, sr, target_sr, target_ac))

    has_melody = any(m is not None for m in processed_melodies)
    if has_melody:
        outputs = MODEL.generate_with_chroma(
            descriptions=texts,
            melody_wavs=processed_melodies,
            melody_sample_rate=target_sr,
            progress=progress,
            return_tokens=USE_DIFFUSION
        )
    else:
        outputs = MODEL.generate(texts, progress=progress, return_tokens=USE_DIFFUSION)

    if USE_DIFFUSION:
        # With return_tokens the model output is indexed as (wav, tokens).
        outputs_diffusion = MBD.tokens_to_wav(outputs[1])
        outputs = torch.cat([outputs[0], outputs_diffusion], dim=0)
    outputs = outputs.detach().cpu().float()

    video_jobs = []
    out_wavs = []
    for wav_out in outputs:
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
            audio_write(
                file.name, wav_out, MODEL.sample_rate, strategy="loudness",
                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
            # Rendering the video is delegated to the process pool.
            video_jobs.append(pool.submit(make_waveform, file.name))
            out_wavs.append(file.name)
            file_cleaner.add(file.name)

    out_videos = []
    for job in video_jobs:
        out_videos.append(job.result())
    for video in out_videos:
        file_cleaner.add(video)

    print("batch finished", len(texts), time.time() - started)
    print("Tempfiles currently stored: ", len(file_cleaner.files))
    return out_videos, out_wavs
143
+
144
+
145
def predict_batched(texts, melodies):
    """Run a batched prediction with the melody model and a fixed duration.

    Each text is truncated to 512 characters before generation. Returns
    whatever `_do_predictions` returns.
    """
    max_text_length = 512
    clipped = []
    for text in texts:
        clipped.append(text[:max_text_length])
    load_model('facebook/musicgen-melody')
    return _do_predictions(clipped, melodies, BATCHED_DURATION)
151
+
152
+
153
def predict_full(model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef, progress=gr.Progress()):
    """Handle a single generation request coming from the full UI.

    Validates the sampling parameters, loads the requested model (and the
    diffusion decoder when `decoder` is "MultiBand_Diffusion"), then generates
    one item.

    Returns:
        A 4-tuple `(video, wav, diffusion_video, diffusion_wav)`; the last two
        are None unless the diffusion decoder was used.

    Raises:
        gr.Error: if temperature, topk or topp is negative, or (from the
            progress callback) when the INTERRUPTING flag has been set.
    """
    global INTERRUPTING
    global USE_DIFFUSION
    INTERRUPTING = False

    checks = (
        (temperature, "Temperature must be >= 0."),
        (topk, "Topk must be non-negative."),
        (topp, "Topp must be non-negative."),
    )
    for value, message in checks:
        if value < 0:
            raise gr.Error(message)

    topk = int(topk)
    USE_DIFFUSION = decoder == "MultiBand_Diffusion"
    if USE_DIFFUSION:
        load_diffusion()
    load_model(model)

    def _progress(generated, to_generate):
        # Report progress, then abort if an interruption was requested.
        progress((min(generated, to_generate), to_generate))
        if INTERRUPTING:
            raise gr.Error("Interrupted.")

    MODEL.set_custom_progress_callback(_progress)

    videos, wavs = _do_predictions(
        [text], [melody], duration, progress=True,
        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
    if not USE_DIFFUSION:
        return videos[0], wavs[0], None, None
    return videos[0], wavs[0], videos[1], wavs[1]
184
+
185
+
186
def toggle_audio_src(choice):
    """Return a Gradio update switching an audio input between mic and upload.

    "mic" selects the microphone source; any other value selects file upload.
    The current value is cleared in both cases.
    """
    if choice != "mic":
        return gr.update(source="upload", value=None, label="File")
    return gr.update(source="microphone", value=None, label="Microphone")
191
+
192
+
193
def toggle_diffusion(choice):
    """Return two visibility updates: shown only for the MultiBand_Diffusion decoder."""
    show = choice == "MultiBand_Diffusion"
    if show:
        return [gr.update(visible=True)] * 2
    return [gr.update(visible=False)] * 2
198
+
199
+
200
def ui_full(launch_kwargs):
    """Build the single-request Gradio interface and launch it.

    Only the prompt, model choice and duration are exposed to the user. The
    remaining inputs of `predict_full` (melody, decoder, sampling parameters)
    and the diffusion outputs are created as hidden components holding their
    default values.

    Args:
        launch_kwargs: Keyword arguments forwarded to `launch()`.
    """
    with gr.Blocks() as interface:

        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Input Text", interactive=True)
                with gr.Row():
                    submit = gr.Button("Submit")
                    # Adapted from https://github.com/rkfg/audiocraft/blob/long/app.py, MIT license.
                    _ = gr.Button("Interrupt").click(fn=interrupt, queue=False)
                with gr.Row():
                    model = gr.Radio(["facebook/musicgen-melody", "facebook/musicgen-medium",
                                      "facebook/musicgen-small", "facebook/musicgen-large"],
                                     label="Model", value="facebook/musicgen-melody", interactive=True)
                with gr.Row():
                    duration = gr.Slider(minimum=1, maximum=120, value=10, label="Duration", interactive=True)

            with gr.Column():
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')

        # Hidden components: they only exist to feed default values to predict_full.
        melody = gr.Audio(source=None, type="numpy", label="File",
                          interactive=False, visible=False, elem_id="melody-input")
        decoder = gr.Radio(["Default", "MultiBand_Diffusion"],
                           label="Decoder", value="Default", interactive=True, visible=False)
        topk = gr.Number(label="Top-k", value=250, interactive=True, visible=False)
        topp = gr.Number(label="Top-p", value=0, interactive=True, visible=False)
        temperature = gr.Number(label="Temperature", value=1.0, interactive=True, visible=False)
        cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True, visible=False)
        diffusion_output = gr.Video(label="MultiBand Diffusion Decoder", visible=False)
        audio_diffusion = gr.Audio(label="MultiBand Diffusion Decoder (wav)", type='filepath', visible=False)

        # Debug output of the hidden components.
        print("melody", melody)
        print("decoder", decoder)
        print("topk", topk)
        print("topp", topp)
        print("cfg_coef", cfg_coef)
        print("diffusion_output", diffusion_output)
        print("audio_diffusion", audio_diffusion)

        predict_inputs = [model, decoder, text, melody, duration, topk, topp, temperature, cfg_coef]
        predict_outputs = [output, audio_output, diffusion_output, audio_diffusion]
        # First toggle the diffusion outputs' visibility, then run the prediction.
        toggled = submit.click(toggle_diffusion, decoder, [diffusion_output, audio_diffusion],
                               queue=False, show_progress=False)
        toggled.then(predict_full, inputs=predict_inputs, outputs=predict_outputs)

        gr.Markdown(
            """
            """
        )

        interface.queue().launch(**launch_kwargs)
313
+
314
def ui_batched(launch_kwargs):
    """Build the batched Gradio demo and launch it.

    Requests are served by `predict_batched`, with up to MAX_BATCH_SIZE
    requests grouped per call.

    Args:
        launch_kwargs: Keyword arguments forwarded to `launch()`.
    """
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            This project generate Music from prompt.
            """
        )
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
                    with gr.Column():
                        radio = gr.Radio(["file", "mic"], value="file",
                                         label="Condition on a melody (optional) File or Mic")
                        melody = gr.Audio(source="upload", type="numpy", label="File",
                                          interactive=True, elem_id="melody-input")
                with gr.Row():
                    submit = gr.Button("Generate")
            with gr.Column():
                output = gr.Video(label="Generated Music")
                audio_output = gr.Audio(label="Generated Music (wav)", type='filepath')

        submit.click(predict_batched, inputs=[text, melody],
                     outputs=[output, audio_output], batch=True, max_batch_size=MAX_BATCH_SIZE)
        # Switch the melody widget between file upload and microphone.
        radio.change(toggle_audio_src, radio, [melody], queue=False, show_progress=False)
        # The example list is intentionally empty.
        gr.Examples(
            fn=predict_batched,
            examples=[

            ],
            inputs=[text, melody],
            outputs=[output]
        )
        gr.Markdown("""

        """)

        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
373
+
374
+
375
if __name__ == "__main__":
    # Command-line entry point: parse the server options, then start either
    # the batched or the full interface depending on IS_BATCHED.
    parser = argparse.ArgumentParser()
    default_host = '0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1'
    parser.add_argument(
        '--listen', type=str, default=default_host,
        help='IP to listen on for connections to Gradio',
    )
    parser.add_argument('--username', type=str, default='', help='Username for authentication')
    parser.add_argument('--password', type=str, default='', help='Password for authentication')
    parser.add_argument(
        '--server_port', type=int, default=0,
        help='Port to run the server listener on',
    )
    parser.add_argument('--inbrowser', action='store_true', help='Open in browser')
    parser.add_argument('--share', action='store_true', help='Share the gradio UI')

    args = parser.parse_args()

    launch_kwargs = {'server_name': args.listen}
    # Authentication is only enabled when both credentials are provided.
    if args.username and args.password:
        launch_kwargs['auth'] = (args.username, args.password)
    # Remaining options are only forwarded when set to a truthy value.
    for option in ('server_port', 'inbrowser', 'share'):
        value = getattr(args, option)
        if value:
            launch_kwargs[option] = value

    # Show the interface
    if IS_BATCHED:
        # Module scope: a plain assignment already binds the global name.
        USE_DIFFUSION = False
        ui_batched(launch_kwargs)
    else:
        ui_full(launch_kwargs)
assets/a_duck_quacking_as_birds_chirp_and_a_pigeon_cooing.mp3 ADDED
Binary file (15.2 kB). View file
 
assets/bach.mp3 ADDED
Binary file (160 kB). View file
 
assets/bolero_ravel.mp3 ADDED
Binary file (161 kB). View file
 
assets/sirens_and_a_humming_engine_approach_and_pass.mp3 ADDED
Binary file (15.2 kB). View file
 
audiocraft/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """
7
+ AudioCraft is a general framework for training audio generative models.
8
+ At the moment we provide the training code for:
9
+
10
+ - [MusicGen](https://arxiv.org/abs/2306.05284), a state-of-the-art
11
+ text-to-music and melody+text autoregressive generative model.
12
+ For the solver, see `audiocraft.solvers.musicgen.MusicGenSolver`, and for the model,
13
+ `audiocraft.models.musicgen.MusicGen`.
14
+ - [AudioGen](https://arxiv.org/abs/2209.15352), a state-of-the-art
15
+ text-to-general-audio generative model.
16
+ - [EnCodec](https://arxiv.org/abs/2210.13438), efficient and high fidelity
17
+ neural audio codec which provides an excellent tokenizer for autoregressive language models.
18
+ See `audiocraft.solvers.compression.CompressionSolver`, and `audiocraft.models.encodec.EncodecModel`.
19
+ - [MultiBandDiffusion](TODO), alternative diffusion-based decoder compatible with EnCodec that
20
+ improves the perceived quality and reduces the artifacts coming from adversarial decoders.
21
+ """
22
+
23
+ # flake8: noqa
24
+ from . import data, modules, models
25
+
26
+ __version__ = '1.0.0'
audiocraft/adversarial/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Adversarial losses and discriminator architectures."""
7
+
8
+ # flake8: noqa
9
+ from .discriminators import (
10
+ MultiPeriodDiscriminator,
11
+ MultiScaleDiscriminator,
12
+ MultiScaleSTFTDiscriminator
13
+ )
14
+ from .losses import (
15
+ AdversarialLoss,
16
+ AdvLossType,
17
+ get_adv_criterion,
18
+ get_fake_criterion,
19
+ get_real_criterion,
20
+ FeatLossType,
21
+ FeatureMatchingLoss
22
+ )
audiocraft/adversarial/discriminators/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # flake8: noqa
8
+ from .mpd import MultiPeriodDiscriminator
9
+ from .msd import MultiScaleDiscriminator
10
+ from .msstftd import MultiScaleSTFTDiscriminator
audiocraft/adversarial/discriminators/base.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from abc import ABC, abstractmethod
8
+ import typing as tp
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+
14
# Feature maps collected from one sub-discriminator.
FeatureMapType = tp.List[torch.Tensor]
# Output of one sub-discriminator.
LogitsType = torch.Tensor
# (logits, feature maps), each with one entry per sub-discriminator.
MultiDiscriminatorOutputType = tp.Tuple[tp.List[LogitsType], tp.List[FeatureMapType]]


class MultiDiscriminator(ABC, nn.Module):
    """Abstract base for discriminators built from several sub-discriminators
    acting at different scales.

    Subclasses must implement `forward` and the `num_discriminators` property.
    """

    @abstractmethod
    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        ...

    @property
    @abstractmethod
    def num_discriminators(self) -> int:
        """Number of discriminators.
        """
        ...
audiocraft/adversarial/discriminators/mpd.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from ...modules import NormConv2d
14
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
15
+
16
+
17
def get_padding(kernel_size: int, dilation: int = 1) -> int:
    """Return half of `dilation * (kernel_size - 1)`, truncated to an int."""
    span = dilation * (kernel_size - 1)
    return int(span / 2)
19
+
20
+
21
class PeriodDiscriminator(nn.Module):
    """Period sub-discriminator.

    The waveform is folded into a 2D map with `period` columns and passed
    through a stack of 2D convolutions applied along the first of the two
    folded axes.

    Args:
        period (int): Period between samples of audio.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_layers (int): Number of convolutional layers.
        kernel_sizes (list of int): Kernel sizes for convolutions.
        stride (int): Stride for convolutions.
        filters (int): Initial number of filters in convolutions.
        filters_scale (int): Multiplier of number of filters as we increase depth.
        max_filters (int): Maximum number of filters.
        norm (str): Normalization method.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
    """
    def __init__(self, period: int, in_channels: int = 1, out_channels: int = 1,
                 n_layers: int = 5, kernel_sizes: tp.List[int] = [5, 3], stride: int = 3,
                 filters: int = 8, filters_scale: int = 4, max_filters: int = 1024,
                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
                 activation_params: dict = {'negative_slope': 0.2}):
        super().__init__()
        self.period = period
        self.n_layers = n_layers
        self.activation = getattr(torch.nn, activation)(**activation_params)
        self.convs = nn.ModuleList()
        inner_kernel, post_kernel = kernel_sizes[0], kernel_sizes[1]
        last_layer = self.n_layers - 1
        in_chs = in_channels
        for depth in range(self.n_layers):
            out_chs = min(filters * (filters_scale ** (depth + 1)), max_filters)
            # The last layer of the stack is not strided.
            eff_stride = 1 if depth == last_layer else stride
            layer = NormConv2d(in_chs, out_chs, kernel_size=(inner_kernel, 1), stride=(eff_stride, 1),
                               padding=((inner_kernel - 1) // 2, 0), norm=norm)
            self.convs.append(layer)
            in_chs = out_chs
        self.conv_post = NormConv2d(in_chs, out_channels, kernel_size=(post_kernel, 1), stride=1,
                                    padding=((post_kernel - 1) // 2, 0), norm=norm)

    def forward(self, x: torch.Tensor):
        """Return `(logits, fmap)`, where `fmap` lists every intermediate
        activation followed by the final output."""
        fmap = []
        # 1d to 2d
        batch, channels, length = x.shape
        remainder = length % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), 'reflect')
            length = length + n_pad
        x = x.view(batch, channels, length // self.period, self.period)

        for conv in self.convs:
            x = self.activation(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return x, fmap
77
+
78
+
79
class MultiPeriodDiscriminator(MultiDiscriminator):
    """Multi-Period (MPD) Discriminator.

    Holds one `PeriodDiscriminator` per entry of `periods`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        periods (Sequence[int]): Periods between samples of audio for the sub-discriminators.
        **kwargs: Additional args for `PeriodDiscriminator`
    """
    def __init__(self, in_channels: int = 1, out_channels: int = 1,
                 periods: tp.Sequence[int] = [2, 3, 5, 7, 11], **kwargs):
        super().__init__()
        subs = []
        for period in periods:
            subs.append(PeriodDiscriminator(period, in_channels, out_channels, **kwargs))
        self.discriminators = nn.ModuleList(subs)

    @property
    def num_discriminators(self):
        return len(self.discriminators)

    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        """Apply every sub-discriminator to the same input `x`."""
        results = [disc(x) for disc in self.discriminators]
        logits = [logit for logit, _ in results]
        fmaps = [fmap for _, fmap in results]
        return logits, fmaps
audiocraft/adversarial/discriminators/msd.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+
13
+ from ...modules import NormConv1d
14
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
15
+
16
+
17
class ScaleDiscriminator(nn.Module):
    """Waveform sub-discriminator.

    Structure: a padded initial convolution, one strided convolution per entry
    of `downsample_scales`, one more convolution, then `conv_post`.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_sizes (Sequence[int]): Kernel sizes for first and last convolutions.
        filters (int): Number of initial filters for convolutions.
        max_filters (int): Maximum number of filters.
        downsample_scales (Sequence[int]): Scale for downsampling implemented as strided convolutions.
        inner_kernel_sizes (Sequence[int] or None): Kernel sizes for inner convolutions.
        groups (Sequence[int] or None): Groups for inner convolutions.
        strides (Sequence[int] or None): Strides for inner convolutions.
        paddings (Sequence[int] or None): Paddings for inner convolutions.
        norm (str): Normalization method.
        activation (str): Activation function.
        activation_params (dict): Parameters to provide to the activation function.
        pad (str): Padding for initial convolution.
        pad_params (dict): Parameters to provide to the padding module.
    """
    def __init__(self, in_channels=1, out_channels=1, kernel_sizes: tp.Sequence[int] = [5, 3],
                 filters: int = 16, max_filters: int = 1024, downsample_scales: tp.Sequence[int] = [4, 4, 4, 4],
                 inner_kernel_sizes: tp.Optional[tp.Sequence[int]] = None, groups: tp.Optional[tp.Sequence[int]] = None,
                 strides: tp.Optional[tp.Sequence[int]] = None, paddings: tp.Optional[tp.Sequence[int]] = None,
                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
                 activation_params: dict = {'negative_slope': 0.2}, pad: str = 'ReflectionPad1d',
                 pad_params: dict = {}):
        super().__init__()
        assert len(kernel_sizes) == 2
        assert kernel_sizes[0] % 2 == 1
        assert kernel_sizes[1] % 2 == 1
        # Every optional per-layer override must provide one value per downsampling layer.
        for per_layer in (inner_kernel_sizes, groups, strides, paddings):
            assert (per_layer is None or len(per_layer) == len(downsample_scales))
        self.activation = getattr(torch.nn, activation)(**activation_params)
        self.convs = nn.ModuleList()

        # The initial convolution uses the product of both kernel sizes.
        first_kernel = np.prod(kernel_sizes)
        first_pad = getattr(torch.nn, pad)((first_kernel - 1) // 2, **pad_params)
        first_conv = NormConv1d(in_channels, filters, kernel_size=first_kernel, stride=1, norm=norm)
        self.convs.append(nn.Sequential(first_pad, first_conv))

        in_chs = filters
        for idx, scale in enumerate(downsample_scales):
            out_chs = min(in_chs * scale, max_filters)
            # Defaults derived from the scale; the default padding always
            # follows the default kernel size, even when the kernel is overridden.
            default_kernel_size = scale * 10 + 1
            conv_kwargs = dict(
                kernel_size=inner_kernel_sizes[idx] if inner_kernel_sizes else default_kernel_size,
                stride=strides[idx] if strides else scale,
                groups=groups[idx] if groups else in_chs // 4,
                padding=paddings[idx] if paddings else (default_kernel_size - 1) // 2,
                norm=norm,
            )
            self.convs.append(NormConv1d(in_chs, out_chs, **conv_kwargs))
            in_chs = out_chs

        out_chs = min(in_chs * 2, max_filters)
        last_kernel, post_kernel = kernel_sizes[0], kernel_sizes[1]
        self.convs.append(NormConv1d(in_chs, out_chs, kernel_size=last_kernel, stride=1,
                                     padding=(last_kernel - 1) // 2, norm=norm))
        self.conv_post = NormConv1d(out_chs, out_channels, kernel_size=post_kernel, stride=1,
                                    padding=(post_kernel - 1) // 2, norm=norm)

    def forward(self, x: torch.Tensor):
        """Return `(logits, fmap)`, where `fmap` lists every intermediate
        activation followed by the final output."""
        fmap = []
        for layer in self.convs:
            x = self.activation(layer(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return x, fmap
93
+
94
+
95
class MultiScaleDiscriminator(MultiDiscriminator):
    """Multi-Scale (MSD) Discriminator.

    Holds one `ScaleDiscriminator` per entry of `scale_norms`. The first one
    receives the input as given; each following one receives the input
    average-pooled once more than the previous one.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        downsample_factor (int): Downsampling factor between the different scales.
        scale_norms (Sequence[str]): Normalization for each sub-discriminator.
        **kwargs: Additional args for ScaleDiscriminator.
    """
    def __init__(self, in_channels: int = 1, out_channels: int = 1, downsample_factor: int = 2,
                 scale_norms: tp.Sequence[str] = ['weight_norm', 'weight_norm', 'weight_norm'], **kwargs):
        super().__init__()
        self.discriminators = nn.ModuleList([
            ScaleDiscriminator(in_channels, out_channels, norm=norm, **kwargs) for norm in scale_norms
        ])
        self.downsample = nn.AvgPool1d(downsample_factor * 2, downsample_factor, padding=downsample_factor)

    @property
    def num_discriminators(self):
        return len(self.discriminators)

    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        """Run every sub-discriminator on a progressively downsampled input.

        Returns:
            `(logits, fmaps)` with one entry per sub-discriminator.
        """
        logits = []
        fmaps = []
        for i, disc in enumerate(self.discriminators):
            if i != 0:
                # The pooled tensor must be assigned back: the previous code
                # discarded it, so every sub-discriminator saw the same
                # full-resolution input and `downsample_factor` had no effect.
                # NOTE(review): checkpoints trained with the old behavior may
                # need retraining or re-evaluation -- confirm before relying on them.
                x = self.downsample(x)
            logit, fmap = disc(x)
            logits.append(logit)
            fmaps.append(fmap)
        return logits, fmaps
audiocraft/adversarial/discriminators/msstftd.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import torchaudio
10
+ import torch
11
+ from torch import nn
12
+ from einops import rearrange
13
+
14
+ from ...modules import NormConv2d
15
+ from .base import MultiDiscriminator, MultiDiscriminatorOutputType
16
+
17
+
18
def get_2d_padding(kernel_size: tp.Tuple[int, int], dilation: tp.Tuple[int, int] = (1, 1)):
    """Return the per-axis padding `((k - 1) * d) // 2` for a 2d kernel and dilation.

    Args:
        kernel_size (tuple of int): Kernel size along the two axes.
        dilation (tuple of int): Dilation along the two axes.
    Returns:
        tuple of int: Padding along the two axes.
    """
    pad_first = ((kernel_size[0] - 1) * dilation[0]) // 2
    pad_second = ((kernel_size[1] - 1) * dilation[1]) // 2
    return (pad_first, pad_second)
20
+
21
+
22
class DiscriminatorSTFT(nn.Module):
    """STFT sub-discriminator.

    Args:
        filters (int): Number of filters in convolutions.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_fft (int): Size of FFT.
        hop_length (int): Length of hop between STFT windows.
        win_length (int): Window size.
        max_filters (int): Upper bound on the number of channels of the inner convolutions.
        filters_scale (int): Growth factor for the filters; the i-th dilated convolution
            outputs `min(filters_scale ** (i + 1) * filters, max_filters)` channels.
        kernel_size (tuple of int): Inner Conv2d kernel sizes.
        dilations (list of int): Inner Conv2d dilations, applied on the first kernel axis.
            One convolution is created per entry.
        stride (tuple of int): Inner Conv2d strides.
        normalized (bool): Forwarded as `normalized` to the spectrogram transform.
        norm (str): Normalization method.
        activation (str): Name of the activation class looked up in `torch.nn`.
        activation_params (dict): Parameters to provide to the activation function.
    """
    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1,
                 n_fft: int = 1024, hop_length: int = 256, win_length: int = 1024, max_filters: int = 1024,
                 filters_scale: int = 1, kernel_size: tp.Tuple[int, int] = (3, 9), dilations: tp.List = [1, 2, 4],
                 stride: tp.Tuple[int, int] = (1, 2), normalized: bool = True, norm: str = 'weight_norm',
                 activation: str = 'LeakyReLU', activation_params: dict = {'negative_slope': 0.2}):
        super().__init__()
        assert len(kernel_size) == 2
        assert len(stride) == 2
        self.filters = filters
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.normalized = normalized
        activation_cls = getattr(torch.nn, activation)
        self.activation = activation_cls(**activation_params)
        # power=None: the forward pass reads `.real` and `.imag` from the transform output.
        self.spec_transform = torchaudio.transforms.Spectrogram(
            n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window_fn=torch.hann_window,
            normalized=self.normalized, center=False, pad_mode=None, power=None)
        # Real and imaginary parts are stacked on the channel axis.
        spec_channels = 2 * self.in_channels

        # Input convolution (created without the `norm` argument, as in the original code).
        layers = [
            NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size))
        ]
        # One strided convolution per dilation.
        in_chs = min(filters_scale * self.filters, max_filters)
        for depth, dilation in enumerate(dilations, start=1):
            out_chs = min((filters_scale ** depth) * self.filters, max_filters)
            layers.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,
                                     dilation=(dilation, 1), padding=get_2d_padding(kernel_size, (dilation, 1)),
                                     norm=norm))
            in_chs = out_chs
        # The last two convolutions use a square kernel built from the first kernel dimension.
        square_kernel = (kernel_size[0], kernel_size[0])
        out_chs = min((filters_scale ** (len(dilations) + 1)) * self.filters, max_filters)
        layers.append(NormConv2d(in_chs, out_chs, kernel_size=square_kernel,
                                 padding=get_2d_padding(square_kernel),
                                 norm=norm))
        self.convs = nn.ModuleList(layers)
        self.conv_post = NormConv2d(out_chs, self.out_channels,
                                    kernel_size=square_kernel,
                                    padding=get_2d_padding(square_kernel),
                                    norm=norm)

    def forward(self, x: torch.Tensor):
        """Compute the logits and the intermediate feature maps.

        Args:
            x (torch.Tensor): Input given to the spectrogram transform.
        Returns:
            tuple: Output of `conv_post` and the list of activated outputs of `self.convs`
                (the `conv_post` output is not part of the feature maps).
        """
        spec = self.spec_transform(x)
        # Stack real and imaginary parts along dim 1, then swap the last two axes.
        spec = torch.cat([spec.real, spec.imag], dim=1)
        hidden = rearrange(spec, 'b c w t -> b c t w')
        feature_maps = []
        for conv in self.convs:
            hidden = self.activation(conv(hidden))
            feature_maps.append(hidden)
        logits = self.conv_post(hidden)
        return logits, feature_maps
92
+
93
+
94
class MultiScaleSTFTDiscriminator(MultiDiscriminator):
    """Multi-Scale STFT (MS-STFT) discriminator.

    Args:
        filters (int): Number of filters in convolutions.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        sep_channels (bool): Separate channels to distinct samples for stereo support.
            NOTE(review): the flag is stored but `forward` does not read it in this file.
        n_ffts (Sequence[int]): Size of FFT for each scale.
        hop_lengths (Sequence[int]): Length of hop between STFT windows for each scale.
        win_lengths (Sequence[int]): Window size for each scale.
        **kwargs: Additional args for DiscriminatorSTFT.
    """
    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1, sep_channels: bool = False,
                 n_ffts: tp.List[int] = [1024, 2048, 512], hop_lengths: tp.List[int] = [256, 512, 128],
                 win_lengths: tp.List[int] = [1024, 2048, 512], **kwargs):
        super().__init__()
        assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
        self.sep_channels = sep_channels
        # One STFT sub-discriminator per (n_fft, win_length, hop_length) triplet.
        sub_discriminators = [
            DiscriminatorSTFT(filters, in_channels=in_channels, out_channels=out_channels,
                              n_fft=n_fft, win_length=win_lengths[scale], hop_length=hop_lengths[scale], **kwargs)
            for scale, n_fft in enumerate(n_ffts)
        ]
        self.discriminators = nn.ModuleList(sub_discriminators)

    @property
    def num_discriminators(self):
        """Number of sub-discriminators."""
        return len(self.discriminators)

    def _separate_channels(self, x: torch.Tensor) -> torch.Tensor:
        """View a 3-dim tensor as single-channel samples, keeping the last dimension."""
        _, _, n_samples = x.shape
        return x.view(-1, 1, n_samples)

    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
        """Evaluate every sub-discriminator on the same input.

        Returns:
            MultiDiscriminatorOutputType: List of logits and list of feature maps,
                one entry per sub-discriminator.
        """
        logits, fmaps = [], []
        for sub_disc in self.discriminators:
            sub_logit, sub_fmap = sub_disc(x)
            logits.append(sub_logit)
            fmaps.append(sub_fmap)
        return logits, fmaps
audiocraft/adversarial/losses.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Utility module to handle adversarial losses without requiring to mess up the main training loop.
9
+ """
10
+
11
+ import typing as tp
12
+
13
+ import flashy
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+
19
+ ADVERSARIAL_LOSSES = ['mse', 'hinge', 'hinge2']
20
+
21
+
22
+ AdvLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor], torch.Tensor]]
23
+ FeatLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor, torch.Tensor], torch.Tensor]]
24
+
25
+
26
class AdversarialLoss(nn.Module):
    """Wrapper bundling an adversary, its optimizer and the associated loss functions.

    Args:
        adversary (nn.Module): The adversary module will be used to estimate the logits given the fake and real samples.
            We assume here the adversary output is ``Tuple[List[torch.Tensor], List[List[torch.Tensor]]]``
            where the first item is a list of logits and the second item is a list of feature maps.
        optimizer (torch.optim.Optimizer): Optimizer used for training the given module.
        loss (AdvLossType): Loss function for generator training.
        loss_real (AdvLossType): Loss function for adversarial training on logits from real samples.
        loss_fake (AdvLossType): Loss function for adversarial training on logits from fake samples.
        loss_feat (FeatLossType): Feature matching loss function for generator training.
        normalize (bool): Whether to normalize by number of sub-discriminators.

    Example of usage:
        adv_loss = AdversarialLoss(adversaries, optimizer, loss, loss_real, loss_fake)
        for real in loader:
            noise = torch.randn(...)
            fake = model(noise)
            adv_loss.train_adv(fake, real)
            loss, _ = adv_loss(fake, real)
            loss.backward()
    """
    def __init__(self,
                 adversary: nn.Module,
                 optimizer: torch.optim.Optimizer,
                 loss: AdvLossType,
                 loss_real: AdvLossType,
                 loss_fake: AdvLossType,
                 loss_feat: tp.Optional[FeatLossType] = None,
                 normalize: bool = True):
        super().__init__()
        self.adversary: nn.Module = adversary
        flashy.distrib.broadcast_model(self.adversary)
        self.optimizer = optimizer
        self.loss = loss
        self.loss_real = loss_real
        self.loss_fake = loss_fake
        self.loss_feat = loss_feat
        self.normalize = normalize

    def _save_to_state_dict(self, destination, prefix, keep_vars):
        # Regular module state first, then the optimizer state under '<prefix>optimizer'.
        super()._save_to_state_dict(destination, prefix, keep_vars)
        optimizer_key = prefix + 'optimizer'
        destination[optimizer_key] = self.optimizer.state_dict()
        return destination

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # The optimizer entry is popped before delegating to the parent implementation.
        optimizer_state = state_dict.pop(prefix + 'optimizer')
        self.optimizer.load_state_dict(optimizer_state)
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    def get_adversary_pred(self, x):
        """Run the adversary and check that it returns lists of tensors as expected."""
        logits, fmaps = self.adversary(x)
        logits_ok = isinstance(logits, list) and all(isinstance(t, torch.Tensor) for t in logits)
        assert logits_ok, \
            f'Expecting a list of tensors as logits but {type(logits)} found.'
        assert isinstance(fmaps, list), f'Expecting a list of features maps but {type(fmaps)} found.'
        for fmap in fmaps:
            fmap_ok = isinstance(fmap, list) and all(isinstance(f, torch.Tensor) for f in fmap)
            assert fmap_ok, \
                f'Expecting a list of tensors as feature maps but {type(fmap)} found.'
        return logits, fmaps

    def train_adv(self, fake: torch.Tensor, real: torch.Tensor) -> torch.Tensor:
        """Perform one optimization step of the adversary on the given fake and real examples.

        The adversary output is expected to be Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
        the logits first, then a list of feature maps for each sub-discriminator.

        Both inputs are detached before being given to the adversary. The backward pass runs
        inside `flashy.distrib.eager_sync_model`, and the optimizer is stepped afterwards.

        Returns:
            torch.Tensor: The adversary loss used for this step.
        """
        loss = torch.tensor(0., device=fake.device)
        fake_logits, _ = self.get_adversary_pred(fake.detach())
        real_logits, _ = self.get_adversary_pred(real.detach())
        n_sub_adversaries = len(fake_logits)
        for fake_logit, real_logit in zip(fake_logits, real_logits):
            loss += self.loss_fake(fake_logit) + self.loss_real(real_logit)

        if self.normalize:
            loss /= n_sub_adversaries

        self.optimizer.zero_grad()
        with flashy.distrib.eager_sync_model(self.adversary):
            loss.backward()
        self.optimizer.step()

        return loss

    def forward(self, fake: torch.Tensor, real: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
        """Compute the generator losses, i.e. the loss for fooling the adversary
        and the feature matching loss (left at zero when `loss_feat` is not provided).

        Returns:
            tuple of torch.Tensor: Adversarial loss and feature matching loss.
        """
        device = fake.device
        adv = torch.tensor(0., device=device)
        feat = torch.tensor(0., device=device)
        with flashy.utils.readonly(self.adversary):
            fake_logits, fake_fmaps = self.get_adversary_pred(fake)
            _, real_fmaps = self.get_adversary_pred(real)
            n_sub_adversaries = len(fake_logits)
            for fake_logit in fake_logits:
                adv += self.loss(fake_logit)
            if self.loss_feat:
                for fmap_fake, fmap_real in zip(fake_fmaps, real_fmaps):
                    feat += self.loss_feat(fmap_fake, fmap_real)

        if self.normalize:
            adv /= n_sub_adversaries
            feat /= n_sub_adversaries

        return adv, feat
136
+
137
+
138
def get_adv_criterion(loss_type: str) -> tp.Callable:
    """Return the generator-side adversarial loss function registered for `loss_type`.

    Raises:
        ValueError: If no criterion matches `loss_type`.
    """
    assert loss_type in ADVERSARIAL_LOSSES
    criteria = {
        'mse': mse_loss,
        'hinge': hinge_loss,
        'hinge2': hinge2_loss,
    }
    if loss_type not in criteria:
        raise ValueError('Unsupported loss')
    return criteria[loss_type]
147
+
148
+
149
def get_fake_criterion(loss_type: str) -> tp.Callable:
    """Return the adversary loss function applied to logits of fake samples.

    Both 'hinge' and 'hinge2' map to the same criterion.

    Raises:
        ValueError: If no criterion matches `loss_type`.
    """
    assert loss_type in ADVERSARIAL_LOSSES
    criteria = {
        'mse': mse_fake_loss,
        'hinge': hinge_fake_loss,
        'hinge2': hinge_fake_loss,
    }
    if loss_type not in criteria:
        raise ValueError('Unsupported loss')
    return criteria[loss_type]
156
+
157
+
158
def get_real_criterion(loss_type: str) -> tp.Callable:
    """Return the adversary loss function applied to logits of real samples.

    Both 'hinge' and 'hinge2' map to the same criterion.

    Raises:
        ValueError: If no criterion matches `loss_type`.
    """
    assert loss_type in ADVERSARIAL_LOSSES
    criteria = {
        'mse': mse_real_loss,
        'hinge': hinge_real_loss,
        'hinge2': hinge_real_loss,
    }
    if loss_type not in criteria:
        raise ValueError('Unsupported loss')
    return criteria[loss_type]
165
+
166
+
167
def mse_real_loss(x: torch.Tensor) -> torch.Tensor:
    """Mean squared error between `x` and a target of ones."""
    target = torch.tensor(1., device=x.device).expand_as(x)
    return F.mse_loss(x, target)
169
+
170
+
171
def mse_fake_loss(x: torch.Tensor) -> torch.Tensor:
    """Mean squared error between `x` and a target of zeros."""
    target = torch.tensor(0., device=x.device).expand_as(x)
    return F.mse_loss(x, target)
173
+
174
+
175
def hinge_real_loss(x: torch.Tensor) -> torch.Tensor:
    """Hinge loss on logits of real samples: `-mean(min(x - 1, 0))`."""
    zeros = torch.tensor(0., device=x.device).expand_as(x)
    clipped = torch.min(x - 1, zeros)
    return -torch.mean(clipped)
177
+
178
+
179
def hinge_fake_loss(x: torch.Tensor) -> torch.Tensor:
    """Hinge loss on logits of fake samples: `-mean(min(-x - 1, 0))`."""
    zeros = torch.tensor(0., device=x.device).expand_as(x)
    clipped = torch.min(-x - 1, zeros)
    return -torch.mean(clipped)
181
+
182
+
183
def mse_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator MSE loss: mean squared error between `x` and a target of ones.

    An empty input yields a one-element zero tensor on the device of `x`.
    """
    if x.numel() != 0:
        target = torch.tensor(1., device=x.device).expand_as(x)
        return F.mse_loss(x, target)
    return torch.tensor([0.0], device=x.device)
187
+
188
+
189
def hinge_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator hinge loss: the negated mean of `x`.

    An empty input yields a one-element zero tensor on the device of `x`.
    """
    if x.numel() != 0:
        return -x.mean()
    return torch.tensor([0.0], device=x.device)
193
+
194
+
195
def hinge2_loss(x: torch.Tensor) -> torch.Tensor:
    """Generator hinge loss clipped at the margin: `-mean(min(x - 1, 0))`.

    Args:
        x (torch.Tensor): Logits given by the adversary.
    Returns:
        torch.Tensor: The loss. An empty input yields a one-element zero tensor.
    """
    if x.numel() == 0:
        # Create the zero on the device of `x`, like `mse_loss` and `hinge_loss` do,
        # so that the result can be accumulated with losses living on that device.
        return torch.tensor([0.0], device=x.device)
    return -torch.mean(torch.min(x - 1, torch.tensor(0., device=x.device).expand_as(x)))
199
+
200
+
201
class FeatureMatchingLoss(nn.Module):
    """Feature matching loss for adversarial training.

    Args:
        loss (nn.Module): Loss to use for feature matching (default=torch.nn.L1Loss).
        normalize (bool): Whether to normalize the loss by the number of feature maps.
    """
    def __init__(self, loss: nn.Module = torch.nn.L1Loss(), normalize: bool = True):
        super().__init__()
        self.loss = loss
        self.normalize = normalize

    def forward(self, fmap_fake: tp.List[torch.Tensor], fmap_real: tp.List[torch.Tensor]) -> torch.Tensor:
        """Sum `self.loss` over the pairs of fake and real feature maps.

        Args:
            fmap_fake (list of torch.Tensor): Feature maps obtained from the fake sample.
            fmap_real (list of torch.Tensor): Feature maps obtained from the real sample,
                each with the same shape as its fake counterpart.
        Returns:
            torch.Tensor: The accumulated loss, divided by the number of feature maps
                when `normalize` is set.
        """
        assert len(fmap_fake) == len(fmap_real) and len(fmap_fake) > 0
        feat_loss = torch.tensor(0., device=fmap_fake[0].device)
        n_fmaps = 0
        for (feat_fake, feat_real) in zip(fmap_fake, fmap_real):
            assert feat_fake.shape == feat_real.shape
            n_fmaps += 1
            # Only the loss itself is accumulated: the mean magnitude of the real features
            # was previously computed on every step but never used.
            feat_loss += self.loss(feat_fake, feat_real)

        if self.normalize:
            feat_loss /= n_fmaps

        return feat_loss
audiocraft/data/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Audio loading and writing support. Datasets for raw audio
7
+ or also including some metadata."""
8
+
9
+ # flake8: noqa
10
+ from . import audio, audio_dataset, info_audio_dataset, music_dataset, sound_dataset
audiocraft/data/audio.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Audio IO methods are defined in this module (info, read, write),
9
+ We rely on av library for faster read when possible, otherwise on torchaudio.
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ import logging
15
+ import typing as tp
16
+
17
+ import numpy as np
18
+ import soundfile
19
+ import torch
20
+ from torch.nn import functional as F
21
+ import torchaudio as ta
22
+
23
+ import av
24
+
25
+ from .audio_utils import f32_pcm, i16_pcm, normalize_audio
26
+
27
+
28
+ _av_initialized = False
29
+
30
+
31
def _init_av():
    """Raise the level of the 'libav.mp3' logger to ERROR, only on the first call."""
    global _av_initialized
    if not _av_initialized:
        logging.getLogger('libav.mp3').setLevel(logging.ERROR)
        _av_initialized = True
38
+
39
+
40
@dataclass(frozen=True)
class AudioFileInfo:
    """Immutable record of the basic properties of an audio file."""
    # Sample rate, as reported by the backend used to read the file.
    sample_rate: int
    # Duration, as reported by the backend used to read the file.
    duration: float
    # Number of audio channels.
    channels: int
45
+
46
+
47
def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Read sample rate, duration and channel count of the first audio stream using PyAV."""
    _init_av()
    with av.open(str(filepath)) as container:
        audio_stream = container.streams.audio[0]
        return AudioFileInfo(
            sample_rate=audio_stream.codec_context.sample_rate,
            # The stream duration is multiplied by the stream time base.
            duration=float(audio_stream.duration * audio_stream.time_base),
            channels=audio_stream.channels,
        )
55
+
56
+
57
def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Read sample rate, duration and channel count using soundfile."""
    sf_info = soundfile.info(filepath)
    return AudioFileInfo(
        sample_rate=sf_info.samplerate,
        duration=sf_info.duration,
        channels=sf_info.channels,
    )
60
+
61
+
62
def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Return the audio file information, picking the backend from the file extension.

    Files with a '.flac' or '.ogg' suffix go through soundfile, everything else through PyAV.
    """
    # torchaudio no longer returns useful duration informations for some formats like mp3s.
    filepath = Path(filepath)
    # ffmpeg has some weird issue with flac.
    # TODO: Validate .ogg can be safely read with av_info
    use_soundfile = filepath.suffix in ['.flac', '.ogg']
    return _soundfile_info(filepath) if use_soundfile else _av_info(filepath)
70
+
71
+
72
def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
    """Read an audio file through the PyAV bindings of FFMPEG.
    This backend is used because soundfile cannot read mp3 and av_read is more efficient than torchaudio.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
    Returns:
        tuple of torch.Tensor, int: Tuple containing audio data and sample rate
    """
    _init_av()
    with av.open(str(filepath)) as container:
        stream = container.streams.audio[0]
        sample_rate = stream.codec_context.sample_rate
        num_frames = int(sample_rate * duration) if duration >= 0 else -1
        frame_offset = int(sample_rate * seek_time)
        # Seek slightly before the requested time (clamped at 0), otherwise the mp3 decoder
        # produces some edge artifact. The extra samples are stripped in the loop below.
        seek_target = int(max(0, (seek_time - 0.1)) / stream.time_base)
        container.seek(seek_target, stream=stream)
        chunks = []
        n_read = 0
        for frame in container.decode(streams=stream.index):
            current_offset = int(frame.rate * frame.pts * frame.time_base)
            # Number of leading samples of this frame located before the requested offset.
            strip = max(0, frame_offset - current_offset)
            chunk = torch.from_numpy(frame.to_ndarray())
            if chunk.shape[0] != stream.channels:
                # First dimension is not the channels: reinterpret as channel-last, then transpose.
                chunk = chunk.view(-1, stream.channels).t()
            chunk = chunk[:, strip:]
            chunks.append(chunk)
            n_read += chunk.shape[1]
            if num_frames > 0 and n_read >= num_frames:
                break
        assert chunks
        # If the above assert fails, it is likely because we seeked past the end of file point,
        # in which case ffmpeg returns a single frame with only zeros, and a weird timestamp.
        # This will need proper debugging, in due time.
        wav = torch.cat(chunks, dim=1)
        assert wav.shape[0] == stream.channels
        if num_frames > 0:
            wav = wav[:, :num_frames]
        return f32_pcm(wav), sample_rate
114
+
115
+
116
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
    """Read an audio file with the backend best suited to its format.

    '.flac' and '.ogg' files are read with soundfile. '.wav' and '.mp3' files read in full
    (no seek, no duration) go through torchaudio when it lists the format as readable.
    Everything else is read with PyAV.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
        pad (bool): Pad output audio if not reaching expected duration.
    Returns:
        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
    """
    fp = Path(filepath)
    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
        # There is some bug with ffmpeg and reading flac
        info = _soundfile_info(filepath)
        n_frames = -1 if duration <= 0 else int(duration * info.sample_rate)
        start_frame = int(seek_time * info.sample_rate)
        data, sr = soundfile.read(filepath, start=start_frame, frames=n_frames, dtype=np.float32)
        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
        wav = torch.from_numpy(data).t().contiguous()
        if wav.dim() == 1:
            # A 1-dim result gets a leading dimension.
            wav = wav.unsqueeze(0)
    elif (
        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
        and duration <= 0 and seek_time == 0
    ):
        # Torchaudio is faster if we load an entire file at once.
        wav, sr = ta.load(fp)
    else:
        wav, sr = _av_read(filepath, seek_time, duration)
    if pad and duration > 0:
        missing_frames = int(duration * sr) - wav.shape[-1]
        wav = F.pad(wav, (0, missing_frames))
    return wav, sr
151
+
152
+
153
def audio_write(stem_name: tp.Union[str, Path],
                wav: torch.Tensor, sample_rate: int,
                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                loudness_compressor: bool = False,
                log_clipping: bool = True, make_parent_dir: bool = True,
                add_suffix: bool = True) -> Path:
    """Save audio to disk after normalization, and return the path it was written to.

    Args:
        stem_name (str or Path): Filename without extension which will be added automatically.
        wav (torch.Tensor): Floating point audio with 1 or 2 dimensions, containing only
            finite values. A 1-dim tensor gets a leading dimension added.
        sample_rate (int): Sample rate of the audio.
        format (str): Either "wav" or "mp3".
        mp3_rate (int): kbps when using mp3s.
        normalize (bool): if `True` (default), normalizes according to the prescribed
            strategy (see after). If `False`, the strategy is only used in case clipping
            would happen.
        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
            with extra headroom to avoid clipping. 'clip' just clips.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
            than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms').
        make_parent_dir (bool): Make parent directory if it doesn't exist.
        add_suffix (bool): Whether to append the format extension to `stem_name`.
    Returns:
        Path: Path of the saved audio.
    Raises:
        ValueError: If `wav` has more than 2 dimensions.
        RuntimeError: If `format` is neither "wav" nor "mp3".
    """
    assert wav.dtype.is_floating_point, "wav is not floating point"
    n_dims = wav.dim()
    if n_dims > 2:
        raise ValueError("Input wav should be at most 2 dimension.")
    if n_dims == 1:
        wav = wav.unsqueeze(0)
    assert wav.isfinite().all()
    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
                          log_clipping=log_clipping, sample_rate=sample_rate,
                          stem_name=str(stem_name))
    # Extra arguments for the torchaudio writer, depending on the format.
    kwargs: dict = {}
    if format == 'mp3':
        suffix = '.mp3'
        kwargs["compression"] = mp3_rate
    elif format == 'wav':
        # wav files are written as 16 bits signed PCM.
        wav = i16_pcm(wav)
        suffix = '.wav'
        kwargs["encoding"] = "PCM_S"
        kwargs["bits_per_sample"] = 16
    else:
        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
    if not add_suffix:
        suffix = ''
    path = Path(str(stem_name) + suffix)
    if make_parent_dir:
        path.parent.mkdir(exist_ok=True, parents=True)
    try:
        ta.save(path, wav, sample_rate, **kwargs)
    except Exception:
        # we do not want to leave half written files around.
        if path.exists():
            path.unlink()
        raise
    return path
audiocraft/data/audio_dataset.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """AudioDataset support. In order to handle a larger number of files
7
+ without having to scan again the folders, we precompute some metadata
8
+ (filename, sample rate, duration), and use that to efficiently sample audio segments.
9
+ """
10
+ import argparse
11
+ import copy
12
+ from concurrent.futures import ThreadPoolExecutor, Future
13
+ from dataclasses import dataclass, fields
14
+ from contextlib import ExitStack
15
+ from functools import lru_cache
16
+ import gzip
17
+ import json
18
+ import logging
19
+ import os
20
+ from pathlib import Path
21
+ import random
22
+ import sys
23
+ import typing as tp
24
+
25
+ import torch
26
+ import torch.nn.functional as F
27
+
28
+ from .audio import audio_read, audio_info
29
+ from .audio_utils import convert_audio
30
+ from .zip import PathInZip
31
+
32
+ try:
33
+ import dora
34
+ except ImportError:
35
+ dora = None # type: ignore
36
+
37
+
38
@dataclass(order=True)
class BaseInfo:
    """Base dataclass providing conversions from and to plain dictionaries."""

    @classmethod
    def _dict2fields(cls, dictionary: dict):
        """Keep only the entries of `dictionary` whose key is a field of the dataclass."""
        field_names = (field.name for field in fields(cls))
        return {name: dictionary[name] for name in field_names if name in dictionary}

    @classmethod
    def from_dict(cls, dictionary: dict):
        """Build an instance from a dictionary, ignoring the keys that are not fields."""
        return cls(**cls._dict2fields(dictionary))

    def to_dict(self):
        """Return a dictionary mapping each field name to its current value."""
        return {field.name: getattr(self, field.name) for field in fields(self)}
58
+
59
+
60
@dataclass(order=True)
class AudioMeta(BaseInfo):
    """Metadata of an audio file, convertible from and to a plain dictionary."""
    path: str
    duration: float
    sample_rate: int
    amplitude: tp.Optional[float] = None
    weight: tp.Optional[float] = None
    # info_path is used to load additional information about the audio file that is stored in zip files.
    info_path: tp.Optional[PathInZip] = None

    @classmethod
    def from_dict(cls, dictionary: dict):
        """Build an instance from a dictionary, wrapping a provided `info_path` into a PathInZip."""
        kwargs = cls._dict2fields(dictionary)
        if kwargs.get('info_path') is not None:
            kwargs['info_path'] = PathInZip(kwargs['info_path'])
        return cls(**kwargs)

    def to_dict(self):
        """Return the fields as a dictionary, with `info_path` converted to a string when set."""
        as_dict = super().to_dict()
        info_path = as_dict['info_path']
        if info_path is not None:
            as_dict['info_path'] = str(info_path)
        return as_dict
82
+
83
+
84
@dataclass(order=True)
class SegmentInfo(BaseInfo):
    """Description of an audio segment: its source file metadata and where it starts."""
    meta: AudioMeta
    seek_time: float
    # The following values are given once the audio is processed, e.g.
    # at the target sample rate and target number of channels.
    # Actual number of frames without padding.
    n_frames: int
    # Total number of frames, padding included.
    total_frames: int
    # Actual sample rate.
    sample_rate: int
    # Number of audio channels.
    channels: int
94
+
95
+
96
+ DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
97
+
98
+ logger = logging.getLogger(__name__)
99
+
100
+
101
def _get_audio_meta(file_path: str, minimal: bool = True) -> AudioMeta:
    """Build the AudioMeta of an audio file.

    Args:
        file_path (str): Resolved path of valid audio file.
        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
            When False, the whole file is read in order to measure its peak amplitude.
    Returns:
        AudioMeta: Audio file path and its metadata.
    """
    info = audio_info(file_path)
    amplitude: tp.Optional[float] = None
    if not minimal:
        wav, _ = audio_read(file_path)
        amplitude = wav.abs().max().item()
    return AudioMeta(path=file_path, duration=info.duration, sample_rate=info.sample_rate, amplitude=amplitude)
116
+
117
+
118
def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
    """If Dora is available as a dependency, try to resolve potential relative paths
    in the given AudioMeta. This method is expected to be used when loading meta from file.

    Both the audio path and, when present, the zip path of `info_path` are resolved in place.

    Args:
        m (AudioMeta): Audio meta to resolve.
        fast (bool): If True, uses a really fast check for determining if a file
            is already absolute or not. Only valid on Linux/Mac.
    Returns:
        AudioMeta: Audio meta with resolved path.
    """
    def is_abs(path) -> bool:
        if fast:
            # Fast check, only valid for POSIX style paths.
            return str(path).startswith('/')
        # The result must be returned, otherwise every path is seen as relative.
        return os.path.isabs(str(path))

    if not dora:
        return m

    if not is_abs(m.path):
        m.path = dora.git_save.to_absolute_path(m.path)
    if m.info_path is not None and not is_abs(m.info_path.zip_path):
        # Resolve the zip path itself, and not the path of the audio file.
        m.info_path.zip_path = dora.git_save.to_absolute_path(m.info_path.zip_path)
    return m
143
+
144
+
145
def find_audio_files(path: tp.Union[Path, str],
                     exts: tp.List[str] = DEFAULT_EXTS,
                     resolve: bool = True,
                     minimal: bool = True,
                     progress: bool = False,
                     workers: int = 0) -> tp.List[AudioMeta]:
    """Walk a folder, collect its audio files and build the sorted list of their AudioMeta.

    Files whose metadata cannot be obtained are reported on stderr and skipped.

    Args:
        path (str or Path): Path to folder containing audio files.
        exts (list of str): List of file extensions to consider for audio files.
        resolve (bool): Whether to pass each AudioMeta through `_resolve_audio_meta`.
        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
        progress (bool): Whether to log progress on audio files collection.
        workers (int): number of parallel workers, if 0, use only the current thread.
    Returns:
        list of AudioMeta: List of audio file path and its metadata.
    """
    audio_files = []
    futures: tp.List[Future] = []
    pool: tp.Optional[ThreadPoolExecutor] = None
    with ExitStack() as stack:
        if workers > 0:
            # The pool is shut down when leaving the stack.
            pool = ThreadPoolExecutor(workers)
            stack.enter_context(pool)

        if progress:
            print("Finding audio files...")
        for root, _, files in os.walk(path, followlinks=True):
            for filename in files:
                candidate = Path(root) / filename
                if candidate.suffix.lower() not in exts:
                    continue
                audio_files.append(candidate)
                if pool is not None:
                    # Metadata extraction starts right away; futures[i] matches audio_files[i].
                    futures.append(pool.submit(_get_audio_meta, str(candidate), minimal))
                if progress:
                    print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)

        if progress:
            print("Getting audio metadata...")
        meta: tp.List[AudioMeta] = []
        for idx, file_path in enumerate(audio_files):
            try:
                if pool is None:
                    m = _get_audio_meta(str(file_path), minimal)
                else:
                    m = futures[idx].result()
                if resolve:
                    m = _resolve_audio_meta(m)
            except Exception as err:
                print("Error with", str(file_path), err, file=sys.stderr)
                continue
            meta.append(m)
            if progress:
                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
    meta.sort()
    return meta
202
+
203
+
204
def load_audio_meta(path: tp.Union[str, Path],
                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
    """Read a list of AudioMeta from a JSON-lines file, gzip-compressed or not.

    A file whose name ends with `.gz` (case-insensitive) is opened with gzip.
    Each line is decoded as one JSON object and turned into an AudioMeta.

    Args:
        path (str or Path): Path to JSON file.
        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
        fast (bool): Forwarded to `_resolve_audio_meta` to select its fast
            absolute-path check.
    Returns:
        list of AudioMeta: One entry per line of the file, in file order.
    """
    compressed = str(path).lower().endswith('.gz')
    opener = gzip.open if compressed else open
    with opener(path, 'rb') as fp:  # type: ignore
        raw_lines = fp.readlines()

    def _parse(raw_line):
        entry = AudioMeta.from_dict(json.loads(raw_line))
        if resolve:
            entry = _resolve_audio_meta(entry, fast=fast)
        return entry

    return [_parse(raw_line) for raw_line in raw_lines]
226
+
227
+
228
def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
    """Write audio metadata to a JSON-lines file, one entry per line.

    Missing parent directories are created. A file whose name ends with `.gz`
    (case-insensitive) is written gzip-compressed.

    Args:
        path (str or Path): Path to JSON file.
        meta (list of AudioMeta): List of audio meta to save.
    """
    Path(path).parent.mkdir(exist_ok=True, parents=True)
    compressed = str(path).lower().endswith('.gz')
    opener = gzip.open if compressed else open
    with opener(path, 'wb') as fp:  # type: ignore
        for entry in meta:
            serialized = json.dumps(entry.to_dict()) + '\n'
            fp.write(serialized.encode('utf-8'))
242
+
243
+
244
class AudioDataset:
    """Base audio dataset.

    The dataset takes a list of AudioMeta and create a dataset composed of segments of audio
    and potentially additional information, by creating random segments from the list of audio
    files referenced in the metadata and applying minimal data pre-processing such as resampling,
    mixing of channels, padding, etc.

    If no segment_duration value is provided, the AudioDataset will return the full wav for each
    audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
    duration, applying padding if required.

    By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
    allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
    original audio meta.

    Note that you can call `start_epoch(epoch)` in order to get
    a deterministic "randomization" for `shuffle=True`.
    For a given epoch and dataset index, this will always return the same extract.
    You can get back some diversity by setting the `shuffle_seed` param.

    Args:
        meta (list of AudioMeta): List of audio files metadata.
        segment_duration (float, optional): Optional segment duration of audio to load.
            If not specified, the dataset will load the full audio segment from the file.
        shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
        num_samples (int): Length reported by the dataset when `segment_duration` is provided.
            When `segment_duration` is None, the number of (filtered) files is used instead.
        sample_rate (int): Target sample rate of the loaded audio samples.
        channels (int): Target number of channels of the loaded audio samples.
        pad (bool): Whether to pad segments up to `segment_duration`, and, in `collater`,
            to pad full-length files of a batch to the longest one.
        sample_on_duration (bool): Set to `True` to sample segments with probability
            dependent on audio file duration. This is only used if `segment_duration` is provided.
        sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
            `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
            of the file duration and file weight. This is only used if `segment_duration` is provided.
        min_segment_ratio (float): Minimum segment ratio to use when the audio file
            is shorter than the desired segment.
        max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
        return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
        min_audio_duration (float, optional): Minimum audio file duration, in seconds, if provided
            audio shorter than this will be filtered out.
        max_audio_duration (float, optional): Maximal audio file duration in seconds, if provided
            audio longer than this will be filtered out.
        shuffle_seed (int): can be used to further randomize
        load_wav (bool): if False, skip loading the wav but returns a tensor of 0
            with the expected segment_duration (which must be provided if load_wav is False).
        permutation_on_files (bool): only if `sample_on_weight` and `sample_on_duration`
            are False. Will ensure a permutation on files when going through the dataset.
            In that case the epoch number must be provided in order for the model
            to continue the permutation across epochs. In that case, it is assumed
            that `num_samples = total_batch_size * num_updates_per_epoch`, with
            `total_batch_size` the overall batch size accounting for all gpus.
    """
    def __init__(self,
                 meta: tp.List[AudioMeta],
                 segment_duration: tp.Optional[float] = None,
                 shuffle: bool = True,
                 num_samples: int = 10_000,
                 sample_rate: int = 48_000,
                 channels: int = 2,
                 pad: bool = True,
                 sample_on_duration: bool = True,
                 sample_on_weight: bool = True,
                 min_segment_ratio: float = 0.5,
                 max_read_retry: int = 10,
                 return_info: bool = False,
                 min_audio_duration: tp.Optional[float] = None,
                 max_audio_duration: tp.Optional[float] = None,
                 shuffle_seed: int = 0,
                 load_wav: bool = True,
                 permutation_on_files: bool = False,
                 ):
        assert len(meta) > 0, "No audio meta provided to AudioDataset. Please check loading of audio meta."
        assert segment_duration is None or segment_duration > 0
        assert segment_duration is None or min_segment_ratio >= 0
        self.segment_duration = segment_duration
        self.min_segment_ratio = min_segment_ratio
        self.max_audio_duration = max_audio_duration
        self.min_audio_duration = min_audio_duration
        if self.min_audio_duration is not None and self.max_audio_duration is not None:
            assert self.min_audio_duration <= self.max_audio_duration
        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
        assert len(self.meta)  # Fail fast if all data has been filtered.
        self.total_duration = sum(d.duration for d in self.meta)

        if segment_duration is None:
            # Full-file mode: one sample per file.
            num_samples = len(self.meta)
        self.num_samples = num_samples
        self.shuffle = shuffle
        self.sample_rate = sample_rate
        self.channels = channels
        self.pad = pad
        self.sample_on_weight = sample_on_weight
        self.sample_on_duration = sample_on_duration
        self.sampling_probabilities = self._get_sampling_probabilities()
        self.max_read_retry = max_read_retry
        self.return_info = return_info
        self.shuffle_seed = shuffle_seed
        self.current_epoch: tp.Optional[int] = None
        self.load_wav = load_wav
        if not load_wav:
            assert segment_duration is not None
        self.permutation_on_files = permutation_on_files
        if permutation_on_files:
            assert not self.sample_on_duration
            assert not self.sample_on_weight
            assert self.shuffle

    def start_epoch(self, epoch: int):
        """Record the current epoch, used to seed the sampling in `__getitem__` and `sample_file`."""
        self.current_epoch = epoch

    def __len__(self):
        return self.num_samples

    def _get_sampling_probabilities(self, normalized: bool = True):
        """Return the sampling probabilities for each file inside `self.meta`."""
        scores: tp.List[float] = []
        for file_meta in self.meta:
            score = 1.
            if self.sample_on_weight and file_meta.weight is not None:
                score *= file_meta.weight
            if self.sample_on_duration:
                score *= file_meta.duration
            scores.append(score)
        probabilities = torch.tensor(scores)
        if normalized:
            probabilities /= probabilities.sum()
        return probabilities

    @staticmethod
    @lru_cache(16)
    def _get_file_permutation(num_files: int, permutation_index: int, base_seed: int):
        # Used to keep the most recent files permutation in memory implicitly.
        # will work unless someone is using a lot of Datasets in parallel.
        rng = torch.Generator()
        rng.manual_seed(base_seed + permutation_index)
        return torch.randperm(num_files, generator=rng)

    def sample_file(self, index: int, rng: torch.Generator) -> AudioMeta:
        """Sample a given file from `self.meta`. Can be overridden in subclasses.
        This is only called if `segment_duration` is not None.

        You must use the provided random number generator `rng` for reproducibility.
        You can further make use of the index accessed.
        """
        if self.permutation_on_files:
            assert self.current_epoch is not None
            # Position of this sample in the stream of all samples since epoch 0.
            total_index = self.current_epoch * len(self) + index
            permutation_index = total_index // len(self.meta)
            relative_index = total_index % len(self.meta)
            permutation = AudioDataset._get_file_permutation(
                len(self.meta), permutation_index, self.shuffle_seed)
            file_index = permutation[relative_index]
            return self.meta[file_index]

        if not self.sample_on_weight and not self.sample_on_duration:
            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
        else:
            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())

        return self.meta[file_index]

    def _audio_read(self, path: str, seek_time: float = 0, duration: float = -1):
        # Override this method in subclass if needed.
        if self.load_wav:
            return audio_read(path, seek_time, duration, pad=False)
        else:
            # No file access: return silence with the expected segment length.
            assert self.segment_duration is not None
            n_frames = int(self.sample_rate * self.segment_duration)
            return torch.zeros(self.channels, n_frames), self.sample_rate

    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
        """Return the wav for `index`, along with its SegmentInfo if `return_info` is set.

        Without `segment_duration` the full file `self.meta[index]` is returned. Otherwise a
        file is sampled with `sample_file` and a segment is read at a random seek time, with
        up to `max_read_retry` attempts before the last read error is re-raised.
        """
        if self.segment_duration is None:
            file_meta = self.meta[index]
            out, sr = audio_read(file_meta.path)
            out = convert_audio(out, sr, self.sample_rate, self.channels)
            n_frames = out.shape[-1]
            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
                                       sample_rate=self.sample_rate, channels=out.shape[0])
        else:
            rng = torch.Generator()
            if self.shuffle:
                # We use index, plus extra randomness, either totally random if we don't know the epoch.
                # otherwise we make use of the epoch number and optional shuffle_seed.
                if self.current_epoch is None:
                    rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
                else:
                    rng.manual_seed(index + self.num_samples * (self.current_epoch + self.shuffle_seed))
            else:
                # We only use index
                rng.manual_seed(index)

            for retry in range(self.max_read_retry):
                file_meta = self.sample_file(index, rng)
                # We add some variance in the file position even if audio file is smaller than segment
                # without ending up with empty segments
                max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
                seek_time = torch.rand(1, generator=rng).item() * max_seek
                try:
                    # Go through `_audio_read` so that `load_wav=False` and subclass
                    # overrides are honoured; with `load_wav=True` this is the same
                    # `audio_read(..., pad=False)` call as before.
                    out, sr = self._audio_read(file_meta.path, seek_time, self.segment_duration)
                    out = convert_audio(out, sr, self.sample_rate, self.channels)
                    n_frames = out.shape[-1]
                    target_frames = int(self.segment_duration * self.sample_rate)
                    if self.pad:
                        out = F.pad(out, (0, target_frames - n_frames))
                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
                                               sample_rate=self.sample_rate, channels=out.shape[0])
                except Exception as exc:
                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
                    if retry == self.max_read_retry - 1:
                        raise
                else:
                    break

        if self.return_info:
            # Returns the wav and additional information on the wave segment
            return out, segment_info
        else:
            return out

    def collater(self, samples):
        """The collater function has to be provided to the dataloader
        if AudioDataset has return_info=True in order to properly collate
        the samples of a batch.
        """
        if self.segment_duration is None and len(samples) > 1:
            assert self.pad, "Must allow padding when batching examples of different durations."

        # In this case the audio reaching the collater is of variable length as segment_duration=None.
        to_pad = self.segment_duration is None and self.pad
        if to_pad:
            # Samples are (wav, info) pairs only when `return_info` is set; otherwise they are
            # bare tensors, and unpacking them as pairs would split the tensor along its first
            # dimension (which only happens to work for 2-channel audio).
            if self.return_info:
                max_len = max([wav.shape[-1] for wav, _ in samples])
            else:
                max_len = max([wav.shape[-1] for wav in samples])

            def _pad_wav(wav):
                return F.pad(wav, (0, max_len - wav.shape[-1]))

        if self.return_info:
            if len(samples) > 0:
                assert len(samples[0]) == 2
                assert isinstance(samples[0][0], torch.Tensor)
                assert isinstance(samples[0][1], SegmentInfo)

            wavs = [wav for wav, _ in samples]
            # Copy the infos so that updating `total_frames` below does not alter the inputs.
            segment_infos = [copy.deepcopy(info) for _, info in samples]

            if to_pad:
                # Each wav could be of a different duration as they are not segmented.
                for i in range(len(samples)):
                    # Determines the total length of the signal with padding, so we update here as we pad.
                    segment_infos[i].total_frames = max_len
                    wavs[i] = _pad_wav(wavs[i])

            wav = torch.stack(wavs)
            return wav, segment_infos
        else:
            assert isinstance(samples[0], torch.Tensor)
            if to_pad:
                samples = [_pad_wav(s) for s in samples]
            return torch.stack(samples)

    def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
        """Filters out audio files with audio durations that will not allow to sample examples from them."""
        orig_len = len(meta)

        # Filter data that is too short.
        if self.min_audio_duration is not None:
            meta = [m for m in meta if m.duration >= self.min_audio_duration]

        # Filter data that is too long.
        if self.max_audio_duration is not None:
            meta = [m for m in meta if m.duration <= self.max_audio_duration]

        filtered_len = len(meta)
        removed_percentage = 100*(1-float(filtered_len)/orig_len)
        msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
        # Only raise the log level when a significant share of the data was dropped.
        if removed_percentage < 10:
            logging.debug(msg)
        else:
            logging.warning(msg)
        return meta

    @classmethod
    def from_meta(cls, root: tp.Union[str, Path], **kwargs):
        """Instantiate AudioDataset from a path to a directory containing a manifest as a jsonl file.

        Args:
            root (str or Path): Path to a manifest file, or to a folder containing
                either a `data.jsonl` or a `data.jsonl.gz` manifest.
            kwargs: Additional keyword arguments for the AudioDataset.
        Raises:
            ValueError: If `root` is a directory that contains neither manifest file.
        """
        root = Path(root)
        if root.is_dir():
            if (root / 'data.jsonl').exists():
                root = root / 'data.jsonl'
            elif (root / 'data.jsonl.gz').exists():
                root = root / 'data.jsonl.gz'
            else:
                raise ValueError("Don't know where to read metadata from in the dir. "
                                 "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
        meta = load_audio_meta(root)
        return cls(meta, **kwargs)

    @classmethod
    def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
                  exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
        """Instantiate AudioDataset from a path containing (possibly nested) audio files.

        If `root` is a file, it is loaded as a metadata manifest instead of being scanned.

        Args:
            root (str or Path): Path to root folder containing audio files.
            minimal_meta (bool): Whether to only load minimal metadata or not.
            exts (list of str): Extensions for audio files.
            kwargs: Additional keyword arguments for the AudioDataset.
        """
        root = Path(root)
        if root.is_file():
            meta = load_audio_meta(root, resolve=True)
        else:
            meta = find_audio_files(root, exts, minimal=minimal_meta, resolve=True)
        return cls(meta, **kwargs)
560
+
561
+
562
def main():
    """Command-line entry point: scan a folder for audio files and save their metadata.

    Parses the command line, collects metadata with `find_audio_files` (progress
    reporting enabled) and writes the result with `save_audio_meta`. Passing
    `--complete` turns off the minimal metadata mode.
    """
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    cli = argparse.ArgumentParser(
        prog='audio_dataset',
        description='Generate .jsonl files by scanning a folder.')
    cli.add_argument('root', help='Root folder with all the audio files')
    cli.add_argument('output_meta_file', help='Output file to store the metadata, ')
    cli.add_argument(
        '--complete', dest='minimal', action='store_false', default=True,
        help='Retrieve all metadata, even the one that are expansive '
             'to compute (e.g. normalization).')
    cli.add_argument(
        '--resolve', action='store_true', default=False,
        help='Resolve the paths to be absolute and with no symlinks.')
    cli.add_argument('--workers', type=int, default=10, help='Number of workers.')
    options = cli.parse_args()

    collected = find_audio_files(
        options.root, DEFAULT_EXTS,
        resolve=options.resolve, minimal=options.minimal,
        progress=True, workers=options.workers)
    save_audio_meta(options.output_meta_file, collected)


if __name__ == '__main__':
    main()
audiocraft/data/audio_utils.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Various utilities for audio conversion (pcm format, sample rate and channels),
7
+ and volume normalization."""
8
+ import sys
9
+ import typing as tp
10
+
11
+ import julius
12
+ import torch
13
+ import torchaudio
14
+
15
+
16
def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
    """Bring an audio tensor to the requested number of channels.

    The channel axis is the second-to-last dimension; any leading dimensions
    are left untouched.

    Args:
        wav (torch.Tensor): Audio wave of shape [..., C, T].
        channels (int): Expected number of channels as output.
    Returns:
        torch.Tensor: Audio wave of shape [..., channels, T].
    Raises:
        ValueError: If the input has several channels, but fewer than requested.
    """
    *leading_dims, n_src, n_samples = wav.shape
    if n_src == channels:
        # Nothing to do, hand the input back as is.
        return wav
    if channels == 1:
        # Mono requested from a multichannel stream: average all channels.
        return wav.mean(dim=-2, keepdim=True)
    if n_src == 1:
        # Several channels requested from a mono input: replicate the single channel.
        return wav.expand(*leading_dims, channels, n_samples)
    if n_src < channels:
        # More channels requested than available and the input is not mono:
        # there is no obvious way to make up the missing channels.
        raise ValueError('The audio file has less channels than requested but is not mono.')
    # More channels available than requested: keep the first ones.
    return wav[..., :channels, :]
47
+
48
+
49
def convert_audio(wav: torch.Tensor, from_rate: float,
                  to_rate: float, to_channels: int) -> torch.Tensor:
    """Resample audio to a new sample rate, then convert its number of channels.

    Both rates are truncated to integers before resampling.
    """
    resampled = julius.resample_frac(wav, int(from_rate), int(to_rate))
    return convert_audio_channels(resampled, to_channels)
55
+
56
+
57
def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
    """Rescale a signal so that its measured loudness reaches `-loudness_headroom_db` dB.

    Loudness is measured with `torchaudio.transforms.Loudness`.

    Args:
        wav (torch.Tensor): Input multichannel audio data.
        sample_rate (int): Sample rate.
        loudness_headroom_db (float): Headroom in dB; the targeted loudness is its negation.
        loudness_compressor (bool): Uses tanh for soft clipping.
        energy_floor (float): anything below that RMS level will not be rescaled.
    Returns:
        torch.Tensor: Loudness normalized output data. The input itself is returned,
            unchanged, when its RMS level is below `energy_floor`.
    """
    rms = wav.pow(2).mean().sqrt().item()
    if rms < energy_floor:
        return wav
    meter = torchaudio.transforms.Loudness(sample_rate)
    input_loudness_db = meter(wav).item()
    # Gain, in dB, separating the measured loudness from the targeted one.
    gain_db = -loudness_headroom_db - input_loudness_db
    output = 10.0 ** (gain_db / 20.0) * wav
    if loudness_compressor:
        output = torch.tanh(output)
    assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
    return output
84
+
85
+
86
def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
    """Clamp the audio to [-1, 1] in place, optionally reporting clipping on stderr.

    Args:
        wav (torch.Tensor): Audio data, modified in place.
        log_clipping (bool): If True, print the fraction of clipped values and the
            peak value whenever the peak exceeds 1.
        stem_name (str, optional): Name inserted in the logged message.
    """
    peak = wav.abs().max()
    if log_clipping and peak > 1:
        clipped_fraction = (wav.abs() > 1).float().mean().item()
        print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
              clipped_fraction, "maximum scale: ", peak.item(), file=sys.stderr)
    wav.clamp_(-1, 1)
94
+
95
+
96
def normalize_audio(wav: torch.Tensor, normalize: bool = True,
                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                    loudness_compressor: bool = False, log_clipping: bool = False,
                    sample_rate: tp.Optional[int] = None,
                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
    """Normalize the audio according to the prescribed strategy (see after).

    Args:
        wav (torch.Tensor): Audio data.
        normalize (bool): if `True` (default), normalizes according to the prescribed
            strategy (see after). If `False`, the strategy is only used in case clipping
            would happen.
        strategy (str): Can be either 'clip', 'peak', 'rms' or 'loudness'. Default is 'peak',
            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
            with extra headroom to avoid clipping. 'clip' just clips. 'loudness' normalizes
            with `normalize_loudness`. An empty string or 'none' leaves the audio untouched,
            but asserts that its absolute values are below 1.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
            than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        loudness_compressor (bool): If True, uses tanh based soft clipping.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms' and 'loudness').
        sample_rate (int): Sample rate for the audio data (required for loudness).
        stem_name (str, optional): Stem name for clipping logging.
    Returns:
        torch.Tensor: Normalized audio.
    """
    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
    scale_rms = 10 ** (-rms_headroom_db / 20)
    if strategy == 'peak':
        peak = wav.abs().max()
        # A silent signal has nothing to rescale: dividing by a zero peak would give an
        # infinite gain and turn the whole output into NaN.
        if peak > 0:
            rescaling = scale_peak / peak
            if normalize or rescaling < 1:
                wav = wav * rescaling
    elif strategy == 'clip':
        wav = wav.clamp(-scale_peak, scale_peak)
    elif strategy == 'rms':
        mono = wav.mean(dim=0)
        energy = mono.pow(2).mean().sqrt()
        # Same guard as for 'peak': with a zero RMS level the gain would be infinite.
        if energy > 0:
            rescaling = scale_rms / energy
            if normalize or rescaling < 1:
                wav = wav * rescaling
        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
    elif strategy == 'loudness':
        assert sample_rate is not None, "Loudness normalization requires sample rate."
        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
    else:
        assert wav.abs().max() < 1
        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
    return wav
146
+
147
+
148
def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert integer PCM audio to floating point PCM.

    Floating point input is returned as is, whatever its precision. int16 and
    int32 input is converted to float and divided by 2**15 and 2**31 respectively.

    Raises:
        ValueError: For any other dtype.
    """
    dtype = wav.dtype
    if dtype.is_floating_point:
        return wav
    # Divisor applied to each supported integer format.
    full_scale = {torch.int16: 2**15, torch.int32: 2**31}
    if dtype not in full_scale:
        raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
    return wav.float() / full_scale[dtype]
158
+
159
+
160
def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
    """Convert audio to int 16 bits PCM format.

    Floating point input must lie in [-1, 1]. It is scaled by 2**15 and rounded;
    if any value would then reach 2**15, the scale 2**15 - 1 is used instead.
    Input that is not floating point must already be int16 and is returned as is.

    ..Warning:: There exist many formula for doing this conversion. None are perfect
    due to the asymmetry of the int16 range. One either have possible clipping, DC offset,
    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
    it is possible that `i16_pcm(f32_pcm(wav)) != wav`.
    """
    if not wav.dtype.is_floating_point:
        assert wav.dtype == torch.int16
        return wav
    assert wav.abs().max() <= 1
    full_scale = 2 ** 15
    quantized = (wav * full_scale).round()
    if quantized.max() >= full_scale:  # clipping would occur
        quantized = (wav * (full_scale - 1)).round()
    return quantized.short()
audiocraft/data/info_audio_dataset.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Base classes for the datasets that also provide non-audio metadata,
7
+ e.g. description, text transcription etc.
8
+ """
9
+ from dataclasses import dataclass
10
+ import logging
11
+ import math
12
+ import re
13
+ import typing as tp
14
+
15
+ import torch
16
+
17
+ from .audio_dataset import AudioDataset, AudioMeta
18
+ from ..environment import AudioCraftEnvironment
19
+ from ..modules.conditioners import SegmentWithAttributes, ConditioningAttributes
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def _clusterify_meta(meta: AudioMeta) -> AudioMeta:
    """Rewrite, in place, the paths of a meta with the environment dataset mappers.

    Both the audio path and, when an info path is set, its zip path go through
    `AudioCraftEnvironment.apply_dataset_mappers`. The same meta object is returned.
    """
    meta.path = AudioCraftEnvironment.apply_dataset_mappers(meta.path)
    info_path = meta.info_path
    if info_path is None:
        return meta
    info_path.zip_path = AudioCraftEnvironment.apply_dataset_mappers(info_path.zip_path)
    return meta
31
+
32
+
33
def clusterify_all_meta(meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
    """Apply `_clusterify_meta` to every meta and return them as a new list."""
    return list(map(_clusterify_meta, meta))
36
+
37
+
38
@dataclass
class AudioInfo(SegmentWithAttributes):
    """Segment info that carries no conditioning attribute.

    The InfoAudioDataset is expected to return metadata that inherits
    from SegmentWithAttributes class and can return conditioning attributes.

    This basically guarantees all datasets will be compatible with current
    solver that contain conditioners requiring this.
    """
    # Populated when using cached batch for training a LM.
    audio_tokens: tp.Optional[torch.Tensor] = None

    def to_condition_attributes(self) -> ConditioningAttributes:
        """Return a default-constructed set of conditioning attributes."""
        attributes = ConditioningAttributes()
        return attributes
52
+
53
+
54
class InfoAudioDataset(AudioDataset):
    """AudioDataset whose segment metadata is returned as an `AudioInfo`.

    The provided meta goes through `clusterify_all_meta` before being handed to
    the parent class.

    See `audiocraft.data.audio_dataset.AudioDataset` for initialization arguments.
    """
    def __init__(self, meta: tp.List[AudioMeta], **kwargs):
        clustered = clusterify_all_meta(meta)
        super().__init__(clustered, **kwargs)

    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentWithAttributes]]:
        item = super().__getitem__(index)
        if self.return_info:
            # Rebuild the segment info as an AudioInfo from its dictionary form.
            wav, segment = item
            return wav, AudioInfo(**segment.to_dict())
        assert isinstance(item, torch.Tensor)
        return item
69
+
70
+
71
def get_keyword_or_keyword_list(value: tp.Optional[str]) -> tp.Union[tp.Optional[str], tp.Optional[tp.List[str]]]:
    """Preprocess a value that is either a single keyword or a list of keywords.

    Lists are handled by `get_keyword_list`, anything else by `get_keyword`.
    """
    preprocess = get_keyword_list if isinstance(value, list) else get_keyword
    return preprocess(value)
77
+
78
+
79
def get_string(value: tp.Optional[str]) -> tp.Optional[str]:
    """Preprocess a single string, keeping its case.

    Returns None when the value is not a string, is empty or is exactly 'None'.
    Otherwise returns the value stripped of surrounding whitespace.
    """
    if not isinstance(value, str):
        return None
    if value == '' or value == 'None':
        return None
    return value.strip()
85
+
86
+
87
def get_keyword(value: tp.Optional[str]) -> tp.Optional[str]:
    """Preprocess a single keyword.

    Returns None when the value is not a string, is empty or is exactly 'None'.
    Otherwise returns the value stripped of surrounding whitespace and lowercased.
    """
    if not isinstance(value, str):
        return None
    if value == '' or value == 'None':
        return None
    return value.strip().lower()
93
+
94
+
95
def get_keyword_list(values: tp.Union[str, tp.List[str]]) -> tp.Optional[tp.List[str]]:
    """Preprocess a list of keywords.

    A string is split on commas and whitespace, a NaN float is treated as an empty
    list, and any other non-list value is wrapped as a single stringified entry.
    Each entry then goes through `get_keyword`, and entries that come back as None
    are dropped.

    Returns:
        list of str, optional: The preprocessed keywords, or None if none is left.
    """
    if isinstance(values, str):
        values = [piece.strip() for piece in re.split(r'[,\s]', values)]
    elif isinstance(values, float) and math.isnan(values):
        values = []
    if not isinstance(values, list):
        logger.debug(f"Unexpected keyword list {values}")
        values = [str(values)]

    cleaned = []
    for candidate in values:
        keyword = get_keyword(candidate)
        if keyword is not None:
            cleaned.append(keyword)
    return cleaned or None
audiocraft/data/music_dataset.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dataset of music tracks with rich metadata.
7
+ """
8
+ from dataclasses import dataclass, field, fields, replace
9
+ import gzip
10
+ import json
11
+ import logging
12
+ from pathlib import Path
13
+ import random
14
+ import typing as tp
15
+
16
+ import torch
17
+
18
+ from .info_audio_dataset import (
19
+ InfoAudioDataset,
20
+ AudioInfo,
21
+ get_keyword_list,
22
+ get_keyword,
23
+ get_string
24
+ )
25
+ from ..modules.conditioners import (
26
+ ConditioningAttributes,
27
+ JointEmbedCondition,
28
+ WavCondition,
29
+ )
30
+ from ..utils.utils import warn_once
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
@dataclass
class MusicInfo(AudioInfo):
    """Audio segment info extended with music-specific metadata."""
    # music-specific metadata
    title: tp.Optional[str] = None
    artist: tp.Optional[str] = None  # anonymized artist id, used to ensure no overlap between splits
    key: tp.Optional[str] = None
    bpm: tp.Optional[float] = None
    genre: tp.Optional[str] = None
    moods: tp.Optional[list] = None
    keywords: tp.Optional[list] = None
    description: tp.Optional[str] = None
    name: tp.Optional[str] = None
    instrument: tp.Optional[str] = None
    # original wav accompanying the metadata
    self_wav: tp.Optional[WavCondition] = None
    # dict mapping attributes names to tuple of wav, text and metadata
    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)

    @property
    def has_music_meta(self) -> bool:
        """True when the `name` attribute is set."""
        return self.name is not None

    def to_condition_attributes(self) -> ConditioningAttributes:
        """Spread the dataclass fields over a fresh `ConditioningAttributes`.

        `self_wav` is stored under `wav`, the entries of `joint_embed` under
        `joint_embed`, and every other field under `text` (list values are
        joined with single spaces).
        """
        out = ConditioningAttributes()
        for _field in fields(self):
            key = _field.name
            value = getattr(self, key)
            if key == 'self_wav':
                out.wav[key] = value
                continue
            if key == 'joint_embed':
                for embed_attribute, embed_cond in value.items():
                    out.joint_embed[embed_attribute] = embed_cond
                continue
            out.text[key] = ' '.join(value) if isinstance(value, list) else value
        return out

    @staticmethod
    def attribute_getter(attribute):
        """Return the preprocessing function for `attribute`, or None if it needs none."""
        if attribute == 'bpm':
            return get_bpm
        if attribute == 'key':
            return get_musical_key
        if attribute in ('moods', 'keywords'):
            return get_keyword_list
        if attribute in ('genre', 'name', 'instrument'):
            return get_keyword
        if attribute in ('title', 'artist', 'description'):
            return get_string
        return None

    @classmethod
    def from_dict(cls, dictionary: dict, fields_required: bool = False):
        """Build an instance from a raw metadata dictionary.

        Args:
            dictionary (dict): Raw values keyed by field name.
            fields_required (bool): When True, a missing non-optional field raises.
        Raises:
            KeyError: A required field is absent from `dictionary`.
        """
        _dictionary: tp.Dict[str, tp.Any] = {}

        # these attributes are not read from the dictionary, they may be populated later
        post_init_attributes = ('self_wav', 'joint_embed')
        # these attributes may be absent even when fields are required
        optional_fields = ('keywords',)

        for _field in fields(cls):
            if _field.name in post_init_attributes:
                continue
            if _field.name not in dictionary:
                if fields_required and _field.name not in optional_fields:
                    raise KeyError(f"Unexpected missing key: {_field.name}")
                continue
            value = dictionary[_field.name]
            preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
            if preprocess_func:
                value = preprocess_func(value)
            _dictionary[_field.name] = value
        return cls(**_dictionary)
113
+
114
+
115
def augment_music_info_description(music_info: MusicInfo, merge_text_p: float = 0.,
                                   drop_desc_p: float = 0., drop_other_p: float = 0.) -> MusicInfo:
    """Augment MusicInfo description with additional metadata fields and potential dropout.
    Additional textual attributes are added given probability merge_text_p and
    the original textual description is dropped from the augmented description given probability drop_desc_p.

    Args:
        music_info (MusicInfo): The music metadata to augment.
        merge_text_p (float): Probability of merging additional metadata to the description.
            If provided value is 0, then no merging is performed.
        drop_desc_p (float): Probability of dropping the original description on text merge.
            if provided value is 0, then no drop out is performed.
        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
            With 0 every valid field is kept, with 1 every field is dropped.
    Returns:
        MusicInfo: A copy of the MusicInfo with augmented textual description.
    """
    def is_valid_field(field_name: str, field_value: tp.Any) -> bool:
        valid_field_name = field_name in ['key', 'bpm', 'genre', 'moods', 'instrument', 'keywords']
        valid_field_value = field_value is not None and isinstance(field_value, (int, float, str, list))
        # drop_other_p is a *drop* probability: the field survives when the draw
        # falls outside of the dropped mass.
        keep_field = random.uniform(0, 1) >= drop_other_p
        return valid_field_name and valid_field_value and keep_field

    def process_value(v: tp.Any) -> str:
        if isinstance(v, (int, float, str)):
            return str(v)
        if isinstance(v, list):
            return ", ".join(v)
        else:
            raise ValueError(f"Unknown type for text value! ({type(v), v})")

    description = music_info.description

    metadata_text = ""
    if random.uniform(0, 1) < merge_text_p:
        meta_pairs = [f'{_field.name}: {process_value(getattr(music_info, _field.name))}'
                      for _field in fields(music_info) if is_valid_field(_field.name, getattr(music_info, _field.name))]
        # shuffle so that the order of the merged attributes is not fixed
        random.shuffle(meta_pairs)
        metadata_text = ". ".join(meta_pairs)
        description = description if not random.uniform(0, 1) < drop_desc_p else None
        logger.debug(f"Applying text augmentation on MMI info. description: {description}, metadata: {metadata_text}")

    if description is None:
        description = metadata_text if len(metadata_text) > 1 else None
    else:
        description = ". ".join([description.rstrip('.'), metadata_text])
    description = description.strip() if description else None

    # work on a copy so that the caller's instance is left untouched
    music_info = replace(music_info)
    music_info.description = description
    return music_info
165
+
166
+
167
class Paraphraser:
    """Randomly swaps a description for one of its pre-computed paraphrases.

    Args:
        paraphrase_source (str or Path): Path to a .json or gzip-compressed .json.gz file
            whose content is loaded as the paraphrase lookup table.
        paraphrase_p (float): Probability of replacing a description by a paraphrase.
    """
    def __init__(self, paraphrase_source: tp.Union[str, Path], paraphrase_p: float = 0.):
        self.paraphrase_p = paraphrase_p
        open_fn = gzip.open if str(paraphrase_source).lower().endswith('.gz') else open
        with open_fn(paraphrase_source, 'rb') as f:  # type: ignore
            self.paraphrase_source = json.loads(f.read())
        logger.info(f"loaded paraphrasing source from: {paraphrase_source}")

    def sample_paraphrase(self, audio_path: str, description: str):
        """Return a paraphrase for the track at `audio_path`, or `description` unchanged.

        The original description is returned when the random draw does not select
        paraphrasing or when the track has no entry in the paraphrase source.
        """
        if random.random() >= self.paraphrase_p:
            return description
        # The table comes from JSON, so its keys are plain strings: look up with a
        # str, a Path object would never compare equal to any key.
        info_path = str(Path(audio_path).with_suffix('.json'))
        if info_path not in self.paraphrase_source:
            warn_once(logger, f"{info_path} not in paraphrase source!")
            return description
        new_desc = random.choice(self.paraphrase_source[info_path])
        logger.debug(f"{description} -> {new_desc}")
        return new_desc
185
+
186
+
187
class MusicDataset(InfoAudioDataset):
    """Music dataset is an AudioDataset with music-related metadata.

    Args:
        info_fields_required (bool): Whether to enforce having required fields.
        merge_text_p (float): Probability of merging additional metadata to the description.
        drop_desc_p (float): Probability of dropping the original description on text merge.
        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
        joint_embed_attributes (list[str], optional): A list of attributes for which joint embedding
            metadata is returned. Defaults to no attribute.
        paraphrase_source (str, optional): Path to the .json or .json.gz file containing the
            paraphrases for the description. The json should be a dict with keys are the
            original info path (e.g. track_path.json) and each value is a list of possible
            paraphrased.
        paraphrase_p (float): probability of taking a paraphrase.

    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
    """
    def __init__(self, *args, info_fields_required: bool = True,
                 merge_text_p: float = 0., drop_desc_p: float = 0., drop_other_p: float = 0.,
                 joint_embed_attributes: tp.Optional[tp.List[str]] = None,
                 paraphrase_source: tp.Optional[str] = None, paraphrase_p: float = 0,
                 **kwargs):
        kwargs['return_info'] = True  # We require the info for each song of the dataset.
        super().__init__(*args, **kwargs)
        self.info_fields_required = info_fields_required
        self.merge_text_p = merge_text_p
        self.drop_desc_p = drop_desc_p
        self.drop_other_p = drop_other_p
        # a fresh list per instance, a shared mutable default would leak between datasets
        self.joint_embed_attributes = joint_embed_attributes if joint_embed_attributes is not None else []
        self.paraphraser = None
        if paraphrase_source is not None:
            self.paraphraser = Paraphraser(paraphrase_source, paraphrase_p)

    def __getitem__(self, index):
        """Return the waveform at `index` together with its `MusicInfo`."""
        wav, info = super().__getitem__(index)
        info_data = info.to_dict()
        # music metadata is looked up in a JSON file next to the audio file
        music_info_path = Path(info.meta.path).with_suffix('.json')

        if Path(music_info_path).exists():
            with open(music_info_path, 'r') as json_file:
                music_data = json.load(json_file)
                # segment info takes precedence over values stored in the JSON
                music_data.update(info_data)
                music_info = MusicInfo.from_dict(music_data, fields_required=self.info_fields_required)
            if self.paraphraser is not None:
                music_info.description = self.paraphraser.sample_paraphrase(
                    music_info.meta.path, music_info.description)
            if self.merge_text_p:
                music_info = augment_music_info_description(
                    music_info, self.merge_text_p, self.drop_desc_p, self.drop_other_p)
        else:
            music_info = MusicInfo.from_dict(info_data, fields_required=False)

        music_info.self_wav = WavCondition(
            wav=wav[None], length=torch.tensor([info.n_frames]),
            sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])

        for att in self.joint_embed_attributes:
            att_value = getattr(music_info, att)
            joint_embed_cond = JointEmbedCondition(
                wav[None], [att_value], torch.tensor([info.n_frames]),
                sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
            music_info.joint_embed[att] = joint_embed_cond

        return wav, music_info
250
+
251
+
252
def get_musical_key(value: tp.Optional[str]) -> tp.Optional[str]:
    """Clean up a musical key annotation.

    Returns:
        The stripped, lower-cased key, or None when the value is missing, is not a
        string, is empty, is the literal 'None', or contains a comma (several keys).
    """
    if not isinstance(value, str) or value in ('', 'None'):
        return None
    if ',' in value:
        # For now, we discard when multiple keys are defined separated with commas
        return None
    return value.strip().lower()
261
+
262
+
263
def get_bpm(value: tp.Optional[str]) -> tp.Optional[float]:
    """Convert a raw BPM value to a float.

    Args:
        value: Raw BPM read from the metadata.
    Returns:
        The BPM as a float, or None when the value is missing or cannot be converted.
    """
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type such as a list
        return None
audiocraft/data/sound_dataset.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dataset of audio with a simple description.
7
+ """
8
+
9
+ from dataclasses import dataclass, fields, replace
10
+ import json
11
+ from pathlib import Path
12
+ import random
13
+ import typing as tp
14
+
15
+ import numpy as np
16
+ import torch
17
+
18
+ from .info_audio_dataset import (
19
+ InfoAudioDataset,
20
+ get_keyword_or_keyword_list
21
+ )
22
+ from ..modules.conditioners import (
23
+ ConditioningAttributes,
24
+ SegmentWithAttributes,
25
+ WavCondition,
26
+ )
27
+
28
+
29
+ EPS = torch.finfo(torch.float32).eps
30
+ TARGET_LEVEL_LOWER = -35
31
+ TARGET_LEVEL_UPPER = -15
32
+
33
+
34
@dataclass
class SoundInfo(SegmentWithAttributes):
    """Segment info augmented with Sound metadata.
    """
    description: tp.Optional[str] = None
    # original wav accompanying the metadata
    self_wav: tp.Optional[WavCondition] = None

    @property
    def has_sound_meta(self) -> bool:
        """True when a description is set."""
        return self.description is not None

    def to_condition_attributes(self) -> ConditioningAttributes:
        """Spread the dataclass fields over a fresh `ConditioningAttributes`.

        `self_wav` is stored under `wav`, every other field under `text`.
        """
        out = ConditioningAttributes()

        for _field in fields(self):
            key, value = _field.name, getattr(self, _field.name)
            if key == 'self_wav':
                out.wav[key] = value
            else:
                out.text[key] = value
        return out

    @staticmethod
    def attribute_getter(attribute):
        """Return the preprocessing function for `attribute`, or None if it needs none."""
        if attribute == 'description':
            preprocess_func = get_keyword_or_keyword_list
        else:
            preprocess_func = None
        return preprocess_func

    @classmethod
    def from_dict(cls, dictionary: dict, fields_required: bool = False):
        """Build an instance from a raw metadata dictionary.

        Args:
            dictionary (dict): Raw values keyed by field name.
            fields_required (bool): When True, a missing field raises.
        Raises:
            KeyError: A required field is absent from `dictionary`.
        """
        _dictionary: tp.Dict[str, tp.Any] = {}

        # allow a subset of attributes to not be loaded from the dictionary
        # these attributes may be populated later
        post_init_attributes = ['self_wav']

        for _field in fields(cls):
            if _field.name in post_init_attributes:
                continue
            elif _field.name not in dictionary:
                if fields_required:
                    raise KeyError(f"Unexpected missing key: {_field.name}")
            else:
                preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
                value = dictionary[_field.name]
                if preprocess_func:
                    value = preprocess_func(value)
                _dictionary[_field.name] = value
        return cls(**_dictionary)
85
+
86
+
87
class SoundDataset(InfoAudioDataset):
    """Audio dataset carrying environmental-sound metadata.

    Args:
        info_fields_required (bool): Whether all the mandatory metadata fields should be in the loaded metadata.
        external_metadata_source (tp.Optional[str]): Folder containing JSON metadata for the corresponding dataset.
            The metadata files contained in this folder are expected to match the stem of the audio file with
            a json extension.
        aug_p (float): Probability of performing audio mixing augmentation on the batch.
        mix_p (float): Proportion of batch items that are mixed together when applying audio mixing augmentation.
        mix_snr_low (int): Lowerbound for SNR value sampled for mixing augmentation.
        mix_snr_high (int): Upperbound for SNR value sampled for mixing augmentation.
        mix_min_overlap (float): Minimum overlap between audio files when performing mixing augmentation.
        kwargs: Additional arguments for AudioDataset.

    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
    """
    def __init__(
        self,
        *args,
        info_fields_required: bool = True,
        external_metadata_source: tp.Optional[str] = None,
        aug_p: float = 0.,
        mix_p: float = 0.,
        mix_snr_low: int = -5,
        mix_snr_high: int = 5,
        mix_min_overlap: float = 0.5,
        **kwargs
    ):
        # the info of every item is always needed, whatever the caller asked for
        kwargs.update(return_info=True)
        super().__init__(*args, **kwargs)
        self.info_fields_required = info_fields_required
        self.external_metadata_source = external_metadata_source
        self.aug_p = aug_p
        self.mix_p = mix_p
        if self.aug_p > 0:
            assert self.mix_p > 0, "Expecting some mixing proportion mix_p if aug_p > 0"
            assert self.channels == 1, "SoundDataset with audio mixing considers only monophonic audio"
        self.mix_snr_low = mix_snr_low
        self.mix_snr_high = mix_snr_high
        self.mix_min_overlap = mix_min_overlap

    def _get_info_path(self, path: tp.Union[str, Path]) -> Path:
        """Locate the JSON metadata file matching an audio path.

        The JSON sitting next to the audio file is preferred; otherwise a file of the
        same name is searched in the external metadata folder, when one is configured.

        Raises:
            Exception: No metadata JSON could be found.
        """
        info_path = Path(path).with_suffix('.json')
        if info_path.exists():
            return info_path
        if self.external_metadata_source:
            external_path = Path(self.external_metadata_source) / info_path.name
            if external_path.exists():
                return external_path
        raise Exception(f"Unable to find a metadata JSON for path: {path}")

    def __getitem__(self, index):
        """Return the waveform at `index` together with its `SoundInfo`."""
        wav, info = super().__getitem__(index)
        info_data = info.to_dict()
        info_path = self._get_info_path(info.meta.path)
        if not Path(info_path).exists():
            sound_info = SoundInfo.from_dict(info_data, fields_required=False)
        else:
            with open(info_path, 'r') as json_file:
                sound_data = json.load(json_file)
                # segment info takes precedence over values stored in the JSON
                sound_data.update(info_data)
                sound_info = SoundInfo.from_dict(sound_data, fields_required=self.info_fields_required)
                # several descriptions available: keep a single one picked at random
                if isinstance(sound_info.description, list):
                    sound_info.description = random.choice(sound_info.description)

        sound_info.self_wav = WavCondition(
            wav=wav[None], length=torch.tensor([info.n_frames]),
            sample_rate=[sound_info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])

        return wav, sound_info

    def collater(self, samples):
        """Collate samples into a batch, applying audio mixing when `aug_p` is positive."""
        # SoundDataset always returns infos
        wav, sound_info = super().collater(samples)
        if self.aug_p <= 0:
            return wav, sound_info
        # when training, audio mixing is performed in the collate function
        wav, sound_info = mix_samples(wav, sound_info, self.aug_p, self.mix_p,
                                      snr_low=self.mix_snr_low, snr_high=self.mix_snr_high,
                                      min_overlap=self.mix_min_overlap)
        return wav, sound_info
171
+
172
+
173
def rms_f(x: torch.Tensor) -> torch.Tensor:
    """Root mean square of `x` computed along dimension 1."""
    mean_square = torch.mean(torch.pow(x, 2), dim=1)
    return torch.pow(mean_square, 0.5)
175
+
176
+
177
def normalize(audio: torch.Tensor, target_level: int = -25) -> torch.Tensor:
    """Rescale each row of `audio` so that its RMS reaches `target_level` (in dB)."""
    target_amplitude = 10 ** (target_level / 20)
    gain = target_amplitude / (rms_f(audio) + EPS)
    return audio * gain.unsqueeze(1)
183
+
184
+
185
def is_clipped(audio: torch.Tensor, clipping_threshold: float = 0.99) -> torch.Tensor:
    """Flag, for each row, whether any sample magnitude exceeds `clipping_threshold`."""
    over_threshold = torch.abs(audio) > clipping_threshold
    return over_threshold.any(1)
187
+
188
+
189
def mix_pair(src: torch.Tensor, dst: torch.Tensor, min_overlap: float) -> torch.Tensor:
    """Add `dst` onto `src` starting at a random offset, in place.

    The offset is drawn between 0 and `(1 - min_overlap)` times the length of `src`;
    whatever part of `dst` would run past the end of `src` is discarded.

    Returns:
        torch.Tensor: `src` itself, modified in place.
    """
    total = src.shape[1]
    start = random.randint(0, int(total * (1 - min_overlap)))
    # never write past the end of src
    stop = min(total, start + dst.shape[1])
    src[:, start:stop] = src[:, start:stop] + dst[:, :stop - start]
    return src
197
+
198
+
199
def snr_mixer(clean: torch.Tensor, noise: torch.Tensor, snr: int, min_overlap: float,
              target_level: int = -25, clipping_threshold: float = 0.99) -> torch.Tensor:
    """Function to mix clean speech and noise at various SNR levels.

    Args:
        clean (torch.Tensor): Clean audio source to mix, of shape [B, T].
        noise (torch.Tensor): Noise audio source to mix, of shape [B, T].
        snr (int): SNR level when mixing.
        min_overlap (float): Minimum overlap between the two mixed sources.
        target_level (int): Gain level in dB.
        clipping_threshold (float): Threshold for clipping the audio.
    Returns:
        torch.Tensor: The mixed audio, of shape [B, T].
    """
    # bring the noise to the same length as the clean signal
    if clean.shape[1] > noise.shape[1]:
        noise = torch.nn.functional.pad(noise, (0, clean.shape[1] - noise.shape[1]))
    else:
        noise = noise[:, :clean.shape[1]]

    # normalizing both sources to target_level
    clean = clean / (clean.max(1)[0].abs().unsqueeze(1) + EPS)
    clean = normalize(clean, target_level)
    rmsclean = rms_f(clean)

    noise = noise / (noise.max(1)[0].abs().unsqueeze(1) + EPS)
    noise = normalize(noise, target_level)
    rmsnoise = rms_f(noise)

    # set the noise level for a given SNR
    noisescalar = (rmsclean / (10 ** (snr / 20)) / (rmsnoise + EPS)).unsqueeze(1)
    noisenewlevel = noise * noisescalar

    # mix noise and clean speech
    noisyspeech = mix_pair(clean, noisenewlevel, min_overlap)

    # randomly select an RMS level in [TARGET_LEVEL_LOWER, TARGET_LEVEL_UPPER) dBFS and
    # normalize noisyspeech with that value
    noisy_rms_level = np.random.randint(TARGET_LEVEL_LOWER, TARGET_LEVEL_UPPER)
    rmsnoisy = rms_f(noisyspeech)
    scalarnoisy = (10 ** (noisy_rms_level / 20) / (rmsnoisy + EPS)).unsqueeze(1)
    noisyspeech = noisyspeech * scalarnoisy

    # final check: rescale the rows whose amplitude exceeds the clipping threshold.
    # The peak is the largest *magnitude*, so that a dominant negative sample is
    # brought back under the threshold as well.
    clipped = is_clipped(noisyspeech, clipping_threshold)
    if clipped.any():
        noisyspeech_maxamplevel = noisyspeech[clipped].abs().max(1)[0].unsqueeze(1) / (clipping_threshold - EPS)
        noisyspeech[clipped] = noisyspeech[clipped] / noisyspeech_maxamplevel

    return noisyspeech
250
+
251
+
252
def snr_mix(src: torch.Tensor, dst: torch.Tensor, snr_low: int, snr_high: int, min_overlap: float):
    """Mix `dst` into `src` with `snr_mixer` at an SNR drawn from [snr_low, snr_high).

    When both bounds are equal, that value is used directly.
    """
    snr = snr_low if snr_low == snr_high else np.random.randint(snr_low, snr_high)
    return snr_mixer(src, dst, snr, min_overlap)
259
+
260
+
261
def mix_text(src_text: tp.Optional[str], dst_text: tp.Optional[str]) -> tp.Optional[str]:
    """Mix text from different sources by concatenating them.

    Identical texts are not repeated. A missing (None) text is ignored, so the
    other text is returned as-is; None is returned when both are missing.
    """
    if src_text == dst_text:
        return src_text
    # descriptions are optional: concatenating None with a str would raise
    if src_text is None:
        return dst_text
    if dst_text is None:
        return src_text
    return src_text + " " + dst_text
266
+
267
+
268
def mix_samples(wavs: torch.Tensor, infos: tp.List[SoundInfo], aug_p: float, mix_p: float,
                snr_low: int, snr_high: int, min_overlap: float):
    """Mix items of a batch together: waveforms are summed, descriptions concatenated.

    Args:
        wavs (torch.Tensor): Audio tensors of shape [B, C, T].
        infos (list[SoundInfo]): List of SoundInfo items corresponding to the audio.
        aug_p (float): Augmentation probability.
        mix_p (float): Proportion of items in the batch to mix (and merge) together.
        snr_low (int): Lowerbound for sampling SNR.
        snr_high (int): Upperbound for sampling SNR.
        min_overlap (float): Minimum overlap between mixed samples.
    Returns:
        tuple[torch.Tensor, list[SoundInfo]]: A tuple containing the mixed wavs
            and mixed SoundInfo for the given batch.
    """
    # nothing to mix within the batch
    if mix_p == 0:
        return wavs, infos

    apply_mixing = random.uniform(0, 1) < aug_p
    if not apply_mixing:
        # no mixing this time: subsample the batch so that its size matches
        # the one obtained when audio mixing is performed
        B, C, T = wavs.shape
        k = int(mix_p * B)
        wav_idx = torch.randperm(B)[:k]
        wavs = wavs[wav_idx]
        infos = [infos[i] for i in wav_idx]
        assert wavs.shape[0] == len(infos), "Mismatch between number of wavs and infos in the batch"
        return wavs, infos  # [B, C, T]

    # all waveform augmentations operate on [B, T]
    assert wavs.size(1) == 1, f"Mix samples requires monophonic audio but C={wavs.size(1)}"
    wavs = wavs.mean(dim=1, keepdim=False)
    B, T = wavs.shape
    k = int(mix_p * B)
    # randomly pair up items of the batch
    mixed_sources_idx = torch.randperm(B)[:k]
    mixed_targets_idx = torch.randperm(B)[:k]
    aug_wavs = snr_mix(
        wavs[mixed_sources_idx],
        wavs[mixed_targets_idx],
        snr_low,
        snr_high,
        min_overlap,
    )

    # merge the textual descriptions of each pair into a copy of the source info
    descriptions = [info.description for info in infos]
    aug_infos = []
    for src_idx, dst_idx in zip(mixed_sources_idx, mixed_targets_idx):
        merged = replace(infos[src_idx])
        merged.description = mix_text(descriptions[src_idx], descriptions[dst_idx])
        aug_infos.append(merged)

    # back to [B, C, T]
    aug_wavs = aug_wavs.unsqueeze(1)
    assert aug_wavs.shape[0] > 0, "Samples mixing returned empty batch."
    assert aug_wavs.dim() == 3, f"Returned wav should be [B, C, T] but dim = {aug_wavs.dim()}"
    assert aug_wavs.shape[0] == len(aug_infos), "Mismatch between number of wavs and infos in the batch"

    return aug_wavs, aug_infos  # [B, C, T]
audiocraft/data/zip.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Utility for reading some info from inside a zip file.
7
+ """
8
+
9
+ import typing
10
+ import zipfile
11
+
12
+ from dataclasses import dataclass
13
+ from functools import lru_cache
14
+ from typing_extensions import Literal
15
+
16
+
17
+ DEFAULT_SIZE = 32
18
+ MODE = Literal['r', 'w', 'x', 'a']
19
+
20
+
21
@dataclass(order=True)
class PathInZip:
    """Location of a file stored inside a zip archive.

    Args:
        path (str): The convention is <path_to_zip>:<relative_path_inside_zip>.
            Let's assume there is a zip file /some/location/foo.zip
            and inside of it is a json file located at /data/file1.json,
            Then we expect path = "/some/location/foo.zip:/data/file1.json".
    """

    INFO_PATH_SEP = ':'
    zip_path: str
    file_path: str

    def __init__(self, path: str) -> None:
        parts = path.split(self.INFO_PATH_SEP)
        # exactly one separator is expected: <zip path>:<path inside the zip>
        assert len(parts) == 2
        self.zip_path = parts[0]
        self.file_path = parts[1]

    @classmethod
    def from_paths(cls, zip_path: str, file_path: str):
        """Build a PathInZip from the archive path and the path inside the archive."""
        return cls(cls.INFO_PATH_SEP.join((zip_path, file_path)))

    def __str__(self) -> str:
        return self.INFO_PATH_SEP.join((self.zip_path, self.file_path))
47
+
48
+
49
def _open_zip(path: str, mode: MODE = 'r'):
    """Open the zip archive located at `path` with the given mode."""
    archive = zipfile.ZipFile(path, mode)
    return archive
51
+
52
+
53
+ _cached_open_zip = lru_cache(DEFAULT_SIZE)(_open_zip)
54
+
55
+
56
def set_zip_cache_size(max_size: int):
    """Replace the zip-opening cache by a new LRU cache of the given capacity.

    Args:
        max_size (int): the maximal LRU cache.
    """
    global _cached_open_zip
    cache_decorator = lru_cache(max_size)
    _cached_open_zip = cache_decorator(_open_zip)
64
+
65
+
66
def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
    """Return a file-like object for a file stored inside a zip archive.

    The archive is obtained through the cached zip opener.

    Args:
        path_in_zip (PathInZip): A PathInZip object representing the file to return a file-like object of.
        mode (str): The mode in which to open the file with.
            NOTE(review): this argument is accepted but not forwarded to the archive.
    Returns:
        A file-like object for PathInZip.
    """
    archive = _cached_open_zip(path_in_zip.zip_path)
    member = path_in_zip.file_path
    return archive.open(member)
audiocraft/environment.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Provides cluster and tools configuration across clusters (slurm, dora, utilities).
9
+ """
10
+
11
+ import logging
12
+ import os
13
+ from pathlib import Path
14
+ import re
15
+ import typing as tp
16
+
17
+ import omegaconf
18
+
19
+ from .utils.cluster import _guess_cluster_type
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class AudioCraftEnvironment:
    """Environment configuration for teams and clusters.

    AudioCraftEnvironment picks compute cluster settings (slurm, dora) from the current running environment
    or declared variable and the loaded team configuration. Additionally, the AudioCraftEnvironment
    provides pointers to a reference folder resolved automatically across clusters that is shared across team members,
    allowing to share sigs or other files to run jobs. Finally, it provides dataset mappers to automatically
    map dataset file paths to new locations across clusters, allowing to use the same manifest of files across clusters.

    The cluster type is identified automatically and base configuration file is read from config/teams.yaml.
    Use the following environment variables to specify the cluster, team or configuration:

        AUDIOCRAFT_CLUSTER (optional): Cluster type to enforce. Useful if the cluster type
            cannot be inferred automatically.
        AUDIOCRAFT_CONFIG (optional): Path to yaml config holding the teams configuration.
            If not set, configuration is read from config/teams.yaml.
        AUDIOCRAFT_TEAM (optional): Name of the team. Recommended to set to your own team.
            Cluster configuration are shared across teams to match compute allocation,
            specify your cluster configuration in the configuration file under a key mapping
            your team name.
    """
    # lazily created singleton, see `instance` and `reset`
    _instance = None
    DEFAULT_TEAM = "default"

    def __init__(self) -> None:
        """Loads configuration."""
        self.team: str = os.getenv("AUDIOCRAFT_TEAM", self.DEFAULT_TEAM)
        cluster_type = _guess_cluster_type()
        cluster = os.getenv(
            "AUDIOCRAFT_CLUSTER", cluster_type.value
        )
        logger.info("Detecting cluster type %s", cluster_type)

        self.cluster: str = cluster

        # default configuration file: <package parent>/config/teams/<team>.yaml
        config_path = os.getenv(
            "AUDIOCRAFT_CONFIG",
            Path(__file__)
            .parent.parent.joinpath("config/teams", self.team)
            .with_suffix(".yaml"),
        )
        self.config = omegaconf.OmegaConf.load(config_path)
        # list of (compiled regex, replacement) pairs applied by `apply_dataset_mappers`
        self._dataset_mappers = []
        cluster_config = self._get_cluster_config()
        if "dataset_mappers" in cluster_config:
            for pattern, repl in cluster_config["dataset_mappers"].items():
                regex = re.compile(pattern)
                self._dataset_mappers.append((regex, repl))

    def _get_cluster_config(self) -> omegaconf.DictConfig:
        """Returns the section of the configuration dedicated to the current cluster."""
        assert isinstance(self.config, omegaconf.DictConfig)
        return self.config[self.cluster]

    @classmethod
    def instance(cls):
        """Returns the shared environment, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    @classmethod
    def reset(cls):
        """Clears the environment and forces a reload on next invocation."""
        cls._instance = None

    @classmethod
    def get_team(cls) -> str:
        """Gets the selected team as dictated by the AUDIOCRAFT_TEAM env var.
        If not defined, defaults to "default".
        """
        return cls.instance().team

    @classmethod
    def get_cluster(cls) -> str:
        """Gets the detected cluster.
        This value can be overridden by the AUDIOCRAFT_CLUSTER env var.
        """
        return cls.instance().cluster

    @classmethod
    def get_dora_dir(cls) -> Path:
        """Gets the path to the dora directory for the current team and cluster.
        Value is overridden by the AUDIOCRAFT_DORA_DIR env var.
        """
        cluster_config = cls.instance()._get_cluster_config()
        dora_dir = os.getenv("AUDIOCRAFT_DORA_DIR", cluster_config["dora_dir"])
        logger.warning(f"Dora directory: {dora_dir}")
        return Path(dora_dir)

    @classmethod
    def get_reference_dir(cls) -> Path:
        """Gets the path to the reference directory for the current team and cluster.
        Value is overridden by the AUDIOCRAFT_REFERENCE_DIR env var.
        """
        cluster_config = cls.instance()._get_cluster_config()
        return Path(os.getenv("AUDIOCRAFT_REFERENCE_DIR", cluster_config["reference_dir"]))

    @classmethod
    def get_slurm_exclude(cls) -> tp.Optional[str]:
        """Get the list of nodes to exclude for that cluster."""
        cluster_config = cls.instance()._get_cluster_config()
        return cluster_config.get("slurm_exclude")

    @classmethod
    def get_slurm_partitions(cls, partition_types: tp.Optional[tp.List[str]] = None) -> str:
        """Gets the requested partitions for the current team and cluster as a comma-separated string.

        Args:
            partition_types (list[str], optional): partition types to retrieve. Values must be
                from ['global', 'team']. If not provided, the global partition is returned.
        """
        if not partition_types:
            partition_types = ["global"]

        cluster_config = cls.instance()._get_cluster_config()
        partitions = [
            cluster_config["partitions"][partition_type]
            for partition_type in partition_types
        ]
        return ",".join(partitions)

    @classmethod
    def resolve_reference_path(cls, path: tp.Union[str, Path]) -> Path:
        """Converts reference placeholder in path with configured reference dir to resolve paths.

        Args:
            path (str or Path): Path to resolve.
        Returns:
            Path: Resolved path.
        """
        path = str(path)

        if path.startswith("//reference"):
            reference_dir = cls.get_reference_dir()
            # Logger.warn is a deprecated alias, Logger.warning is the supported spelling
            logger.warning(f"Reference directory: {reference_dir}")
            assert (
                reference_dir.exists() and reference_dir.is_dir()
            ), f"Reference directory does not exist: {reference_dir}."
            path = re.sub("^//reference", str(reference_dir), path)

        return Path(path)

    @classmethod
    def apply_dataset_mappers(cls, path: str) -> str:
        """Applies dataset mapping regex rules as defined in the configuration.
        If no rules are defined, the path is returned as-is.
        """
        instance = cls.instance()

        for pattern, repl in instance._dataset_mappers:
            path = pattern.sub(repl, path)

        return path
audiocraft/grids/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Dora Grids."""
audiocraft/grids/_base_explorers.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from abc import ABC, abstractmethod
8
+ import time
9
+ import typing as tp
10
+ from dora import Explorer
11
+ import treetable as tt
12
+
13
+
14
def get_sheep_ping(sheep) -> tp.Optional[str]:
    """Describe how long ago the Sheep last updated its log file.

    Returns None when the sheep has no log or the log file does not exist. Otherwise
    returns the elapsed time with one decimal, expressed in the largest fitting unit:
    days ('d'), hours ('h'), minutes ('m') or seconds ('s').
    """
    if sheep.log is None or not sheep.log.exists():
        return None

    elapsed = time.time() - sheep.log.stat().st_mtime
    one_hour = 3600
    one_day = 3600 * 24
    if elapsed > one_day:
        return f'{elapsed / one_day:.1f}d'
    if elapsed > one_hour:
        return f'{elapsed / one_hour:.1f}h'
    if elapsed > 60:
        return f'{elapsed / 60:.1f}m'
    return f'{elapsed:.1f}s'
29
+
30
+
31
class BaseExplorer(ABC, Explorer):
    """Base explorer for AudioCraft grids.

    Task-specific explorers must implement the `get_grid_metrics` method to
    specify which metrics to display for a given task.

    If additional stages are used, the child explorer must define how to handle
    these new stages in the `process_history` and `process_sheep` methods.
    """
    def stages(self):
        """Return the names of the stages tracked by this explorer."""
        return ["train", "valid", "evaluate"]

    def get_grid_meta(self):
        """Returns the list of Meta information to display for each XP/job.
        """
        return [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
            tt.leaf("sid", align="<"),
        ]

    @abstractmethod
    def get_grid_metrics(self):
        """Return the metrics that should be displayed in the tracking table.
        """
        ...

    def process_sheep(self, sheep, history):
        """Merge the per-epoch metrics of `history` into one dict per stage.

        Later epochs override earlier ones for the same metric name. The 'train'
        part always carries the number of epochs, and, when the sheep has a log,
        every stage from `stages()` receives a 'ping' entry.

        Args:
            sheep: Job handle passed to `get_sheep_ping`.
            history: Sequence of per-epoch mappings `{stage_name: {metric: value}}`.
                It is left untouched by this method.
        Returns:
            dict: Mapping from stage name to the merged metrics of that stage.
        """
        train = {
            "epoch": len(history),
        }
        parts = {"train": train}
        for metrics in history:
            for key, sub in metrics.items():
                part = parts.get(key, {})
                if 'duration' in sub:
                    # Convert to minutes for readability. Work on a copy: rewriting
                    # the entry in place would alter the caller's history and divide
                    # the same value again on every subsequent call.
                    sub = dict(sub)
                    sub['duration'] = sub['duration'] / 60.
                part.update(sub)
                parts[key] = part
        ping = get_sheep_ping(sheep)
        if ping is not None:
            for name in self.stages():
                if name not in parts:
                    parts[name] = {}
                # Add the ping to each part for convenience.
                parts[name]['ping'] = ping
        return parts
audiocraft/grids/audiogen/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """AudioGen grids."""
audiocraft/grids/audiogen/audiogen_base_16khz.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ..musicgen._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Grid scheduling one medium-scale XP of the `audiogen/audiogen_base_16khz` solver on 64 GPUs."""
    launcher.slurm_(
        gpus=64,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    launcher.bind_(solver='audiogen/audiogen_base_16khz')
    # replace this by the desired environmental sound dataset
    launcher.bind_(dset='internal/sounds_16khz')

    # FSDP is enabled (with autocast off) for every XP of this grid.
    fsdp_opts = {'autocast': False, 'fsdp.use': True}
    launcher.bind_(fsdp_opts)

    medium_scale = {'model/lm/model_scale': 'medium'}
    launcher(medium_scale)
audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Evaluation with objective metrics for the pretrained AudioGen models.
9
+ This grid takes signature from the training grid and runs evaluation-only stage.
10
+
11
+ When running the grid for the first time, please use:
12
+ REGEN=1 dora grid audiogen.audiogen_pretrained_16khz_eval
13
+ and re-use the REGEN=1 option when the grid is changed to force regenerating it.
14
+
15
+ Note that you need the proper metrics external libraries setup to use all
16
+ the objective metrics activated in this grid. Refer to the README for more information.
17
+ """
18
+
19
+ import os
20
+
21
+ from ..musicgen._explorers import GenerationEvalExplorer
22
+ from ...environment import AudioCraftEnvironment
23
+ from ... import train
24
+
25
+
26
def eval(launcher, batch_size: int = 32):
    """Schedule one evaluation-only XP with objective metrics on top of `launcher`.

    Args:
        launcher: Launcher the evaluation options are bound onto (a bound copy is
            used, the given launcher itself is not modified in place).
        batch_size (int): Value passed as `+dataset.evaluate.batch_size`.
    """
    eval_opts = {
        'dset': 'audio/audiocaps_16khz',
        'solver/audiogen/evaluation': 'objective_eval',
        'execute_only': 'evaluate',
        '+dataset.evaluate.batch_size': batch_size,
        '+metrics.fad.tf.batch_size': 32,
    }
    # binary for FAD computation: replace this path with your own path
    fad_opts = {
        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
    }
    sampling_opts = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
    two_step_cfg_opts = {'transformer_lm.two_step_cfg': True}

    eval_launcher = launcher.bind(eval_opts)
    eval_launcher.bind_(fad_opts)

    # base objective metrics
    eval_launcher(sampling_opts, two_step_cfg_opts)
46
+
47
+
48
@GenerationEvalExplorer
def explorer(launcher):
    """Evaluation grid for the pretrained AudioGen medium model.

    Unless the `REGEN` environment variable is set, the XPs already registered in this
    grid's folder (one symlink per signature) are re-scheduled as-is. With `REGEN` set,
    the grid is rebuilt from the pretrained checkpoint.
    """
    launcher.slurm_(
        gpus=4,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )

    regenerate = 'REGEN' in os.environ
    if not regenerate:
        grid_name = __name__.split('.', 2)[-1]
        folder = train.main.dora.dir / 'grids' / grid_name
        with launcher.job_array():
            for entry in folder.iterdir():
                # only symlinks are treated as XP signatures
                if entry.is_symlink():
                    xp = train.main.get_xp_from_sig(entry.name)
                    launcher(xp.argv)
        return

    audiogen_base = launcher.bind(solver="audiogen/audiogen_base_16khz")
    audiogen_base.bind_({'autocast': False, 'fsdp.use': True})

    pretrained_medium = {'continue_from': '//pretrained/facebook/audiogen-medium'}
    audiogen_base_medium = audiogen_base.bind(pretrained_medium)
    audiogen_base_medium.bind_({'model/lm/model_scale': 'medium'})
    eval(audiogen_base_medium, batch_size=128)
audiocraft/grids/compression/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """EnCodec grids."""
audiocraft/grids/compression/_explorers.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import treetable as tt
8
+
9
+ from .._base_explorers import BaseExplorer
10
+
11
+
12
class CompressionExplorer(BaseExplorer):
    """Explorer defining the table layout for compression grids."""
    eval_metrics = ["sisnr", "visqol"]

    def stages(self):
        """Return the names of the stages tracked by this explorer."""
        return ["train", "valid", "evaluate"]

    def get_grid_meta(self):
        """Return the meta information columns displayed for each XP/job."""
        columns = [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
        ]
        return columns

    def get_grid_metrics(self):
        """Return the metric groups (one per stage) displayed in the tracking table."""
        train_group = tt.group(
            "train",
            [
                tt.leaf("epoch"),
                tt.leaf("bandwidth", ".2f"),
                tt.leaf("adv", ".4f"),
                tt.leaf("d_loss", ".4f"),
            ],
            align=">",
        )
        valid_group = tt.group(
            "valid",
            [
                tt.leaf("bandwidth", ".2f"),
                tt.leaf("adv", ".4f"),
                tt.leaf("msspec", ".4f"),
                tt.leaf("sisnr", ".2f"),
            ],
            align=">",
        )
        # One column per entry of `eval_metrics`.
        evaluate_leaves = [tt.leaf(metric, ".3f") for metric in self.eval_metrics]
        evaluate_group = tt.group("evaluate", evaluate_leaves, align=">")
        return [train_group, valid_group, evaluate_group]
audiocraft/grids/compression/debug.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel an experiment by commenting out its line.
11
+
12
+ This grid is a minimal example for debugging compression task
13
+ and how to override parameters directly in a grid.
14
+ Learn more about dora grids: https://github.com/facebookresearch/dora
15
+ """
16
+
17
+ from ._explorers import CompressionExplorer
18
+ from ...environment import AudioCraftEnvironment
19
+
20
+
21
@CompressionExplorer
def explorer(launcher):
    """Minimal debug grid: the `compression/debug` solver plus one overridden variant, on 2 GPUs."""
    launcher.slurm_(
        gpus=2,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    launcher.bind_(solver='compression/debug')

    rvq_override = {'rvq.bins': 2048, 'rvq.n_q': 4}
    with launcher.job_array():
        # base debug task using config from solver=compression/debug
        launcher()
        # we can override parameters in the grid to launch additional xps
        launcher(rvq_override)
audiocraft/grids/compression/encodec_audiogen_16khz.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel an experiment by commenting out its line.
11
+
12
+ This grid shows how to train the new AudioGen EnCodec model at 16 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Grid scheduling a single XP of the `compression/encodec_audiogen_16khz` solver on 8 GPUs."""
    launcher.slurm_(
        gpus=8,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    # use configuration for AudioGen's EnCodec model trained on monophonic audio sampled at 16 kHz
    # AudioGen's EnCodec is trained with a total stride of 320 leading to a frame rate of 50 Hz
    launcher.bind_(solver='compression/encodec_audiogen_16khz')
    # replace this by the desired sound dataset
    launcher.bind_(dset='internal/sounds_16khz')
    # launch xp
    launcher()
audiocraft/grids/compression/encodec_base_24khz.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel an experiment by commenting out its line.
11
+
12
+ This grid shows how to train a base causal EnCodec model at 24 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Grid scheduling a single XP of the `compression/encodec_base_24khz` solver on 8 GPUs."""
    launcher.slurm_(
        gpus=8,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    # base causal EnCodec trained on monophonic audio sampled at 24 kHz
    launcher.bind_(solver='compression/encodec_base_24khz')
    # replace this by the desired dataset
    launcher.bind_(dset='audio/example')
    # launch xp
    launcher()
audiocraft/grids/compression/encodec_musicgen_32khz.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Grid search file, simply list all the exp you want in `explorer`.
9
+ Any new exp added there will be scheduled.
10
+ You can cancel an experiment by commenting out its line.
11
+
12
+ This grid shows how to train a MusicGen EnCodec model at 32 kHz.
13
+ """
14
+
15
+ from ._explorers import CompressionExplorer
16
+ from ...environment import AudioCraftEnvironment
17
+
18
+
19
@CompressionExplorer
def explorer(launcher):
    """Grid for the `compression/encodec_musicgen_32khz` solver: a base XP and a ViSQOL-evaluated one."""
    launcher.slurm_(
        gpus=8,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    # use configuration for MusicGen's EnCodec model trained on monophonic audio sampled at 32 kHz
    # MusicGen's EnCodec is trained with a total stride of 640 leading to a frame rate of 50 Hz
    launcher.bind_(solver='compression/encodec_musicgen_32khz')
    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')

    # launch the base xp
    launcher()

    # same xp with the ViSQOL metric enabled; replace the binary path with your own
    visqol_opts = {
        'metrics.visqol.bin': '/data/home/jadecopet/local/usr/opt/visqol',
        'label': 'visqol',
        'evaluate.metrics.visqol': True
    }
    launcher(visqol_opts)
audiocraft/grids/diffusion/4_bands_base_32khz.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Training of the 4 diffusion models described in
9
+ "From Discrete Tokens to High-Fidelity Audio Using Multi-Band Diffusion"
10
+ (paper link).
11
+ """
12
+
13
+ from ._explorers import DiffusionExplorer
14
+
15
+
16
@DiffusionExplorer
def explorer(launcher):
    """Grid scheduling one `diffusion/default` XP per frequency band (4 bands), on 4 GPUs each."""
    launcher.slurm_(gpus=4, partition='learnfair')

    launcher.bind_({'solver': 'diffusion/default',
                    'dset': 'internal/music_10k_32khz'})

    # (filter.idx_band, processor.use, processor.power_std) for each of the 4 bands
    band_settings = [
        (0, False, 0.4),
        (1, False, 0.4),
        (2, True, 0.4),
        (3, True, 0.75),
    ]
    with launcher.job_array():
        for idx_band, use_processor, power_std in band_settings:
            launcher({
                'filter.use': True,
                'filter.idx_band': idx_band,
                "processor.use": use_processor,
                'processor.power_std': power_std,
            })
audiocraft/grids/diffusion/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Diffusion grids."""
audiocraft/grids/diffusion/_explorers.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import treetable as tt
8
+
9
+ from .._base_explorers import BaseExplorer
10
+
11
+
12
class DiffusionExplorer(BaseExplorer):
    """Explorer defining the table layout for diffusion grids, including the EMA stages."""
    eval_metrics = ["sisnr", "visqol"]

    def stages(self):
        """Return the names of the stages tracked by this explorer."""
        return ["train", "valid", "valid_ema", "evaluate", "evaluate_ema"]

    def get_grid_meta(self):
        """Return the meta information columns displayed for each XP/job."""
        columns = [
            tt.leaf("index", align=">"),
            tt.leaf("name", wrap=140),
            tt.leaf("state"),
            tt.leaf("sig", align=">"),
        ]
        return columns

    def get_grid_metrics(self):
        """Return the metric groups (one per stage) displayed in the tracking table."""
        rvm_names = ("rvm", "rvm_0", "rvm_1", "rvm_2", "rvm_3")

        train_group = tt.group(
            "train",
            [
                tt.leaf("epoch"),
                tt.leaf("loss", ".3%"),
            ],
            align=">",
        )
        # Both validation stages show the same single loss column.
        valid_group = tt.group("valid", [tt.leaf("loss", ".3%")], align=">")
        valid_ema_group = tt.group("valid_ema", [tt.leaf("loss", ".3%")], align=">")
        # Both evaluation stages show the same rvm columns; each group gets its own leaves.
        evaluate_group = tt.group(
            "evaluate", [tt.leaf(name, ".4f") for name in rvm_names], align=">"
        )
        evaluate_ema_group = tt.group(
            "evaluate_ema", [tt.leaf(name, ".4f") for name in rvm_names], align=">"
        )
        return [train_group, valid_group, valid_ema_group, evaluate_group, evaluate_ema_group]
audiocraft/grids/musicgen/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """MusicGen grids."""
audiocraft/grids/musicgen/_explorers.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import typing as tp
8
+
9
+ import treetable as tt
10
+
11
+ from .._base_explorers import BaseExplorer
12
+
13
+
14
class LMExplorer(BaseExplorer):
    """Explorer for language-model grids: shows train/valid metrics and the best validation ppl."""
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        """Return the names of the stages tracked by this explorer."""
        return ['train', 'valid']

    def get_grid_metrics(self):
        """Return the metric groups (one per stage) displayed in the tracking table."""
        train_leaves = [
            tt.leaf('epoch'),
            tt.leaf('duration', '.1f'),  # duration in minutes
            tt.leaf('ping'),
            tt.leaf('ce', '.4f'),  # cross entropy
            tt.leaf("ppl", '.3f'),  # perplexity
        ]
        valid_leaves = [
            tt.leaf('ce', '.4f'),
            tt.leaf('ppl', '.3f'),
            tt.leaf('best_ppl', '.3f'),
        ]
        return [
            tt.group('train', train_leaves, align='>'),
            tt.group('valid', valid_leaves, align='>'),
        ]

    def process_sheep(self, sheep, history):
        """Extend the base parts with `best_<metric>` entries for the validation stage.

        The best value of each tracked metric over all epochs of `history` is added
        to `parts['valid']` (when that part exists), so runs can be compared in the grid.
        """
        parts = super().process_sheep(sheep, history)

        # metric name -> which direction is better; values should be in ['lower', 'higher']
        track_by = {'ppl': 'lower'}

        # Start from the worst possible value for each direction.
        best = {}
        for name, mode in track_by.items():
            best[name] = float('inf') if mode == 'lower' else -float('inf')

        for metrics in history:
            for stage, values in metrics.items():
                if stage != 'valid':
                    continue
                for name, mode in track_by.items():
                    if name not in values:
                        continue
                    candidate = values[name]
                    if mode == 'lower':
                        improved = candidate < best[name]
                    else:
                        improved = candidate > best[name]
                    if improved:
                        best[name] = candidate

        if 'valid' in parts:
            for name, value in best.items():
                parts['valid'][f'best_{name}'] = value
        return parts
67
+
68
+
69
class GenerationEvalExplorer(BaseExplorer):
    """Explorer for evaluation-only generation grids: a single 'evaluate' stage."""
    eval_metrics: tp.List[str] = []

    def stages(self) -> tp.List[str]:
        """Return the names of the stages tracked by this explorer."""
        return ['evaluate']

    def get_grid_metrics(self):
        """Return the metric group displayed in the tracking table."""
        evaluate_leaves = [
            tt.leaf('epoch', '.3f'),
            tt.leaf('duration', '.1f'),
            tt.leaf('ping'),
            tt.leaf('ce', '.4f'),
            tt.leaf('ppl', '.3f'),
            tt.leaf('fad', '.3f'),
            tt.leaf('kld', '.3f'),
            tt.leaf('text_consistency', '.3f'),
            tt.leaf('chroma_cosine', '.3f'),
        ]
        return [tt.group('evaluate', evaluate_leaves, align='>')]
audiocraft/grids/musicgen/musicgen_base_32khz.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Grid for the `musicgen/musicgen_base_32khz` solver at three scales.

    Schedules the default scale on 32 GPUs, medium on 64 GPUs and large on 96 GPUs,
    all with FSDP enabled.
    """
    launcher.slurm_(
        gpus=32,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    launcher.bind_(solver='musicgen/musicgen_base_32khz')
    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')

    # model scales
    medium = {'model/lm/model_scale': 'medium'}
    large = {'model/lm/model_scale': 'large'}

    # regularization and optimizer overrides
    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}

    fsdp = {'autocast': False, 'fsdp.use': True}
    launcher.bind_(fsdp)

    launcher.slurm_(gpus=32).bind_(label='32gpus')
    with launcher.job_array():
        launcher.bind()()

    launcher.slurm_(gpus=64).bind_(label='64gpus')
    with launcher.job_array():
        launcher.bind()(medium, adam)

    launcher.slurm_(gpus=96).bind_(label='96gpus')
    with launcher.job_array():
        launcher.bind()(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})
audiocraft/grids/musicgen/musicgen_base_cached_32khz.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Grid for the `musicgen/musicgen_base_32khz` solver using a cache.

    The first phase schedules sharded cache-writing jobs with a dedicated launcher
    (xsmall model, no conditioner, 8 GPUs). The function then returns early; once the
    cache is written (or sufficiently ahead), remove the `return` statement to schedule
    the actual training jobs that read from the same cache path.
    """
    partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
    launcher.slurm_(gpus=32, partition=partitions)
    launcher.bind_(solver='musicgen/musicgen_base_32khz')
    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')

    fsdp = {'autocast': False, 'fsdp.use': True}
    medium = {'model/lm/model_scale': 'medium'}
    large = {'model/lm/model_scale': 'large'}

    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
    wd_low = {'conditioners.description.t5.word_dropout': 0.2}

    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}

    # BEGINNING OF CACHE WRITING JOBS.
    cache_write = {
        'cache.path': '/fsx-codegen/defossez/cache/interleave_stereo_nv_32k',
        'cache.write': True,
        'generate.every': 500,
        'evaluate.every': 500,
        'logging.log_updates': 50,
    }

    cache_sub = launcher.bind({'model/lm/model_scale': 'xsmall', 'conditioner': 'none'})
    cache_sub.bind_({'deadlock.use': True})
    cache_sub.slurm_(gpus=8)
    # Schedule the shards through `cache_sub`, not `launcher`: otherwise the settings
    # configured just above (xsmall model, no conditioner, 8 GPUs) are never applied.
    with cache_sub.job_array():
        num_shards = 10  # total number of jobs running in parallel.
        for shard in range(num_shards):
            cache_sub(cache_write, {'cache.write_num_shards': num_shards, 'cache.write_shard': shard})

    # REMOVE THE FOLLOWING RETURN STATEMENT ONCE THE ABOVE JOBS ARE DONE,
    # OR SUFFICIENTLY AHEAD.
    return

    cache = {
        'cache.path': '/fsx-codegen/defossez/cache/interleave_stereo_nv_32k',
    }
    launcher.bind_(fsdp, cache)

    launcher.slurm_(gpus=32).bind_(label='32gpus')
    with launcher.job_array():
        sub = launcher.bind()
        sub()

    launcher.slurm_(gpus=64).bind_(label='64gpus')
    with launcher.job_array():
        sub = launcher.bind()
        sub(medium, adam)

    launcher.slurm_(gpus=96).bind_(label='96gpus')
    with launcher.job_array():
        sub = launcher.bind()
        sub(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})
audiocraft/grids/musicgen/musicgen_clapemb_32khz.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Grid for the `musicgen/musicgen_base_32khz` solver with the `clapemb2music` conditioner.

    Schedules four XPs on 32 GPUs: every combination of using the CLAP embedding
    cache or not, and overriding `text_p` or not.
    """
    launcher.slurm_(
        gpus=32,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    launcher.bind_(solver='musicgen/musicgen_base_32khz')
    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')
    launcher.bind_(conditioner='clapemb2music')

    clap_cache = {
        'conditioners.description.clap.cache_path':
            '/fsx-audio-craft-llm/jadecopet/experiments/audiocraft/caches/clap_embed_music'
    }
    text_p_override = {'conditioners.description.clap.text_p': 0.5}

    launcher.bind_({'autocast': False, 'fsdp.use': True})

    launcher.slurm_(gpus=32).bind_(label='32gpus')
    with launcher.job_array():
        launcher()
        launcher(text_p_override)
        launcher(clap_cache)
        launcher(clap_cache, text_p_override)
audiocraft/grids/musicgen/musicgen_melody_32khz.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from ._explorers import LMExplorer
8
+ from ...environment import AudioCraftEnvironment
9
+
10
+
11
@LMExplorer
def explorer(launcher):
    """Grid for the `musicgen/musicgen_melody_32khz` solver.

    First schedules single-GPU jobs that populate the chroma stem cache, then the
    training XPs: default scale on 32 GPUs (with and without the cache), medium on
    64 GPUs and large on 96 GPUs.
    """
    launcher.slurm_(
        gpus=32,
        partition=AudioCraftEnvironment.get_slurm_partitions(['team', 'global']),
    )
    launcher.bind_(solver='musicgen/musicgen_melody_32khz')
    # replace this by the desired music dataset
    launcher.bind_(dset='internal/music_400k_32khz')

    # model scales
    medium = {'model/lm/model_scale': 'medium'}
    large = {'model/lm/model_scale': 'large'}

    # regularization and optimizer overrides
    cfg_low = {'classifier_free_guidance.training_dropout': 0.2}
    wd_low = {'conditioners.description.t5.word_dropout': 0.2}
    adam = {'optim.optimizer': 'adamw', 'optim.lr': 1e-4}

    fsdp = {'autocast': False, 'fsdp.use': True}

    cache_path = {
        'conditioners.self_wav.chroma_stem.cache_path':
            '/fsx-audio-craft-llm/jadecopet/experiments/audiocraft/caches/chroma_stem'
    }

    # CACHE GENERATION JOBS
    n_cache_gen_jobs = 4
    cache_gen_opts = {
        # the cache is always computed over the whole file, so duration doesn't matter here.
        'dataset.segment_duration': 2.,
        'dataset.batch_size': 8,
        'dataset.train.permutation_on_files': True,  # try to not repeat files.
        'optim.epochs': 10,
        'model/lm/model_scale': 'xsmall',
    }
    gen_sub = launcher.slurm(gpus=1)
    gen_sub.bind_(cache_path, cache_gen_opts)
    with gen_sub.job_array():
        # each job uses its own shuffle seed
        for seed in range(n_cache_gen_jobs):
            gen_sub({'dataset.train.shuffle_seed': seed})

    # ACTUAL TRAINING JOBS.
    launcher.bind_(fsdp)

    launcher.slurm_(gpus=32).bind_(label='32gpus')
    with launcher.job_array():
        base = launcher.bind()
        base()
        base(cache_path)

    launcher.slurm_(gpus=64).bind_(label='64gpus')
    with launcher.job_array():
        launcher.bind()(medium, adam)

    launcher.slurm_(gpus=96).bind_(label='96gpus')
    with launcher.job_array():
        launcher.bind()(large, cfg_low, wd_low, adam, {'optim.max_norm': 3})