androstj committed on
Commit
f6f30f3
·
0 Parent(s):

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ # .idea/
163
+ .vscode/
164
+ .ruff_cache/
CHANGELOG.md ADDED
File without changes
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <[email protected]>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to audiobox-aesthetics
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Meta's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Meta has a [bounty program](https://bugbounty.meta.com/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to audiobox-aesthetics, you agree that your contributions will be licensed
31
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution 4.0 International Public License
58
+
59
+ By exercising the Licensed Rights (defined below), You accept and agree
60
+ to be bound by the terms and conditions of this Creative Commons
61
+ Attribution 4.0 International Public License ("Public License"). To the
62
+ extent this Public License may be interpreted as a contract, You are
63
+ granted the Licensed Rights in consideration of Your acceptance of
64
+ these terms and conditions, and the Licensor grants You such rights in
65
+ consideration of benefits the Licensor receives from making the
66
+ Licensed Material available under these terms and conditions.
67
+
68
+
69
+ Section 1 -- Definitions.
70
+
71
+ a. Adapted Material means material subject to Copyright and Similar
72
+ Rights that is derived from or based upon the Licensed Material
73
+ and in which the Licensed Material is translated, altered,
74
+ arranged, transformed, or otherwise modified in a manner requiring
75
+ permission under the Copyright and Similar Rights held by the
76
+ Licensor. For purposes of this Public License, where the Licensed
77
+ Material is a musical work, performance, or sound recording,
78
+ Adapted Material is always produced where the Licensed Material is
79
+ synched in timed relation with a moving image.
80
+
81
+ b. Adapter's License means the license You apply to Your Copyright
82
+ and Similar Rights in Your contributions to Adapted Material in
83
+ accordance with the terms and conditions of this Public License.
84
+
85
+ c. Copyright and Similar Rights means copyright and/or similar rights
86
+ closely related to copyright including, without limitation,
87
+ performance, broadcast, sound recording, and Sui Generis Database
88
+ Rights, without regard to how the rights are labeled or
89
+ categorized. For purposes of this Public License, the rights
90
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
91
+ Rights.
92
+
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. Share means to provide material to the public by any means or
116
+ process that requires permission under the Licensed Rights, such
117
+ as reproduction, public display, public performance, distribution,
118
+ dissemination, communication, or importation, and to make material
119
+ available to the public including in ways that members of the
120
+ public may access the material from a place and at a time
121
+ individually chosen by them.
122
+
123
+ j. Sui Generis Database Rights means rights other than copyright
124
+ resulting from Directive 96/9/EC of the European Parliament and of
125
+ the Council of 11 March 1996 on the legal protection of databases,
126
+ as amended and/or succeeded, as well as other essentially
127
+ equivalent rights anywhere in the world.
128
+
129
+ k. You means the individual or entity exercising the Licensed Rights
130
+ under this Public License. Your has a corresponding meaning.
131
+
132
+
133
+ Section 2 -- Scope.
134
+
135
+ a. License grant.
136
+
137
+ 1. Subject to the terms and conditions of this Public License,
138
+ the Licensor hereby grants You a worldwide, royalty-free,
139
+ non-sublicensable, non-exclusive, irrevocable license to
140
+ exercise the Licensed Rights in the Licensed Material to:
141
+
142
+ a. reproduce and Share the Licensed Material, in whole or
143
+ in part; and
144
+
145
+ b. produce, reproduce, and Share Adapted Material.
146
+
147
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
148
+ Exceptions and Limitations apply to Your use, this Public
149
+ License does not apply, and You do not need to comply with
150
+ its terms and conditions.
151
+
152
+ 3. Term. The term of this Public License is specified in Section
153
+ 6(a).
154
+
155
+ 4. Media and formats; technical modifications allowed. The
156
+ Licensor authorizes You to exercise the Licensed Rights in
157
+ all media and formats whether now known or hereafter created,
158
+ and to make technical modifications necessary to do so. The
159
+ Licensor waives and/or agrees not to assert any right or
160
+ authority to forbid You from making technical modifications
161
+ necessary to exercise the Licensed Rights, including
162
+ technical modifications necessary to circumvent Effective
163
+ Technological Measures. For purposes of this Public License,
164
+ simply making modifications authorized by this Section 2(a)
165
+ (4) never produces Adapted Material.
166
+
167
+ 5. Downstream recipients.
168
+
169
+ a. Offer from the Licensor -- Licensed Material. Every
170
+ recipient of the Licensed Material automatically
171
+ receives an offer from the Licensor to exercise the
172
+ Licensed Rights under the terms and conditions of this
173
+ Public License.
174
+
175
+ b. No downstream restrictions. You may not offer or impose
176
+ any additional or different terms or conditions on, or
177
+ apply any Effective Technological Measures to, the
178
+ Licensed Material if doing so restricts exercise of the
179
+ Licensed Rights by any recipient of the Licensed
180
+ Material.
181
+
182
+ 6. No endorsement. Nothing in this Public License constitutes or
183
+ may be construed as permission to assert or imply that You
184
+ are, or that Your use of the Licensed Material is, connected
185
+ with, or sponsored, endorsed, or granted official status by,
186
+ the Licensor or others designated to receive attribution as
187
+ provided in Section 3(a)(1)(A)(i).
188
+
189
+ b. Other rights.
190
+
191
+ 1. Moral rights, such as the right of integrity, are not
192
+ licensed under this Public License, nor are publicity,
193
+ privacy, and/or other similar personality rights; however, to
194
+ the extent possible, the Licensor waives and/or agrees not to
195
+ assert any such rights held by the Licensor to the limited
196
+ extent necessary to allow You to exercise the Licensed
197
+ Rights, but not otherwise.
198
+
199
+ 2. Patent and trademark rights are not licensed under this
200
+ Public License.
201
+
202
+ 3. To the extent possible, the Licensor waives any right to
203
+ collect royalties from You for the exercise of the Licensed
204
+ Rights, whether directly or through a collecting society
205
+ under any voluntary or waivable statutory or compulsory
206
+ licensing scheme. In all other cases the Licensor expressly
207
+ reserves any right to collect such royalties.
208
+
209
+
210
+ Section 3 -- License Conditions.
211
+
212
+ Your exercise of the Licensed Rights is expressly made subject to the
213
+ following conditions.
214
+
215
+ a. Attribution.
216
+
217
+ 1. If You Share the Licensed Material (including in modified
218
+ form), You must:
219
+
220
+ a. retain the following if it is supplied by the Licensor
221
+ with the Licensed Material:
222
+
223
+ i. identification of the creator(s) of the Licensed
224
+ Material and any others designated to receive
225
+ attribution, in any reasonable manner requested by
226
+ the Licensor (including by pseudonym if
227
+ designated);
228
+
229
+ ii. a copyright notice;
230
+
231
+ iii. a notice that refers to this Public License;
232
+
233
+ iv. a notice that refers to the disclaimer of
234
+ warranties;
235
+
236
+ v. a URI or hyperlink to the Licensed Material to the
237
+ extent reasonably practicable;
238
+
239
+ b. indicate if You modified the Licensed Material and
240
+ retain an indication of any previous modifications; and
241
+
242
+ c. indicate the Licensed Material is licensed under this
243
+ Public License, and include the text of, or the URI or
244
+ hyperlink to, this Public License.
245
+
246
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
247
+ reasonable manner based on the medium, means, and context in
248
+ which You Share the Licensed Material. For example, it may be
249
+ reasonable to satisfy the conditions by providing a URI or
250
+ hyperlink to a resource that includes the required
251
+ information.
252
+
253
+ 3. If requested by the Licensor, You must remove any of the
254
+ information required by Section 3(a)(1)(A) to the extent
255
+ reasonably practicable.
256
+
257
+ 4. If You Share Adapted Material You produce, the Adapter's
258
+ License You apply must not prevent recipients of the Adapted
259
+ Material from complying with this Public License.
260
+
261
+
262
+ Section 4 -- Sui Generis Database Rights.
263
+
264
+ Where the Licensed Rights include Sui Generis Database Rights that
265
+ apply to Your use of the Licensed Material:
266
+
267
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268
+ to extract, reuse, reproduce, and Share all or a substantial
269
+ portion of the contents of the database;
270
+
271
+ b. if You include all or a substantial portion of the database
272
+ contents in a database in which You have Sui Generis Database
273
+ Rights, then the database in which You have Sui Generis Database
274
+ Rights (but not its individual contents) is Adapted Material; and
275
+
276
+ c. You must comply with the conditions in Section 3(a) if You Share
277
+ all or a substantial portion of the contents of the database.
278
+
279
+ For the avoidance of doubt, this Section 4 supplements and does not
280
+ replace Your obligations under this Public License where the Licensed
281
+ Rights include other Copyright and Similar Rights.
282
+
283
+
284
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285
+
286
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296
+
297
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306
+
307
+ c. The disclaimer of warranties and limitation of liability provided
308
+ above shall be interpreted in a manner that, to the extent
309
+ possible, most closely approximates an absolute disclaimer and
310
+ waiver of all liability.
311
+
312
+
313
+ Section 6 -- Term and Termination.
314
+
315
+ a. This Public License applies for the term of the Copyright and
316
+ Similar Rights licensed here. However, if You fail to comply with
317
+ this Public License, then Your rights under this Public License
318
+ terminate automatically.
319
+
320
+ b. Where Your right to use the Licensed Material has terminated under
321
+ Section 6(a), it reinstates:
322
+
323
+ 1. automatically as of the date the violation is cured, provided
324
+ it is cured within 30 days of Your discovery of the
325
+ violation; or
326
+
327
+ 2. upon express reinstatement by the Licensor.
328
+
329
+ For the avoidance of doubt, this Section 6(b) does not affect any
330
+ right the Licensor may have to seek remedies for Your violations
331
+ of this Public License.
332
+
333
+ c. For the avoidance of doubt, the Licensor may also offer the
334
+ Licensed Material under separate terms or conditions or stop
335
+ distributing the Licensed Material at any time; however, doing so
336
+ will not terminate this Public License.
337
+
338
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339
+ License.
340
+
341
+
342
+ Section 7 -- Other Terms and Conditions.
343
+
344
+ a. The Licensor shall not be bound by any additional or different
345
+ terms or conditions communicated by You unless expressly agreed.
346
+
347
+ b. Any arrangements, understandings, or agreements regarding the
348
+ Licensed Material not stated herein are separate from and
349
+ independent of the terms and conditions of this Public License.
350
+
351
+
352
+ Section 8 -- Interpretation.
353
+
354
+ a. For the avoidance of doubt, this Public License does not, and
355
+ shall not be interpreted to, reduce, limit, restrict, or impose
356
+ conditions on any use of the Licensed Material that could lawfully
357
+ be made without permission under this Public License.
358
+
359
+ b. To the extent possible, if any provision of this Public License is
360
+ deemed unenforceable, it shall be automatically reformed to the
361
+ minimum extent necessary to make it enforceable. If the provision
362
+ cannot be reformed, it shall be severed from this Public License
363
+ without affecting the enforceability of the remaining terms and
364
+ conditions.
365
+
366
+ c. No term or condition of this Public License will be waived and no
367
+ failure to comply consented to unless expressly agreed to by the
368
+ Licensor.
369
+
370
+ d. Nothing in this Public License constitutes or may be interpreted
371
+ as a limitation upon, or waiver of, any privileges and immunities
372
+ that apply to the Licensor or You, including from the legal
373
+ processes of any jurisdiction or authority.
374
+
375
+
376
+ =======================================================================
377
+
378
+ Creative Commons is not a party to its public
379
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
380
+ its public licenses to material it publishes and in those instances
381
+ will be considered the “Licensor.” The text of the Creative Commons
382
+ public licenses is dedicated to the public domain under the CC0 Public
383
+ Domain Dedication. Except for the limited purpose of indicating that
384
+ material is shared under a Creative Commons public license or as
385
+ otherwise permitted by the Creative Commons policies published at
386
+ creativecommons.org/policies, Creative Commons does not authorize the
387
+ use of the trademark "Creative Commons" or any other trademark or logo
388
+ of Creative Commons without its prior written consent including,
389
+ without limitation, in connection with any unauthorized modifications
390
+ to any of its public licenses or any other arrangements,
391
+ understandings, or agreements concerning use of licensed material. For
392
+ the avoidance of doubt, this paragraph does not form part of the
393
+ public licenses.
394
+
395
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # audiobox-aesthetics
2
+
3
+ Unified automatic quality assessment for speech, music, and sound.
4
+
5
+ [](Paper)
6
+
7
+ ## Installation
8
+
9
+ This repository requires Python 3.9 or greater and PyTorch 2.2 or greater. To install, you can clone this repo and run:
10
+ ```
11
+ pip install -e .
12
+ ```
13
+
14
+ ## Pre-trained Models
15
+
16
+ Model | Link
17
+ |---|---|
18
+ All axes | [checkpoint.pt](https://dl.fbaipublicfiles.com/audiobox-aesthetics/checkpoint.pt)
19
+
20
+ ## Usage
21
+
22
+ How to run prediction:
23
+
24
+ 1. Create a JSONL file with the following format
25
+ ```
26
+ {"path":"/path/to/a.wav"}
27
+ {"path":"/path/to/b.wav"}
28
+ ...
29
+ {"path":"/path/to/z.wav"}
30
+ ```
31
+ or, if you only want to predict the aesthetic score for a certain time range
32
+ ```
33
+ {"path":"/path/to/a.wav", "start_time":0, "end_time": 5}
34
+ {"path":"/path/to/b.wav", "start_time":3, "end_time": 10}
35
+ ```
36
+ and save it as `input.jsonl`
37
+
38
+ 2. Run the following command
39
+ ```
40
+ audio-aes input.jsonl --ckpt "/path/to/checkpoint.pt" > output.jsonl
41
+ ```
42
+
43
+ 3. The output file will contain the same number of rows as `input.jsonl`. Each row contains the predictions for the 4 axes as a JSON-formatted dictionary. Check the following table for more info:
44
+ Axes name | Full name
45
+ |---|---|
46
+ CE | Content Enjoyment
47
+ CU | Content Usefulness
48
+ PC | Production Complexity
49
+ PQ | Production Quality
50
+
51
+ Output line example:
52
+ ```
53
+ {"CE": 5.146, "CU": 5.779, "PC": 2.148, "PQ": 7.220}
54
+ ```
55
+
56
+
57
+
58
+ 4. (Extra) If you want to extract only one axis (e.g. CE), post-process the output file with the following command using the `jq` utility:
59
+
60
+ ```jq '.CE' output.jsonl > output-aes_ce.txt```
61
+
62
+
63
+
64
+ ## Evaluation dataset
65
+ We released our evaluation dataset, which consists of aesthetic annotation scores along 4 axes.
66
+
67
+ Here, we show an example of how to read and re-map each annotation to the actual audio file.
68
+ ```
69
+ {
70
+ "data_path": "/your_path/LibriTTS/train-clean-100/1363/139304/1363_139304_000011_000000.wav",
71
+ "Production_Quality": [8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 8.0, 5.0, 8.0, 8.0],
72
+ "Production_Complexity": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
73
+ "Content_Enjoyment": [8.0, 6.0, 8.0, 5.0, 8.0, 8.0, 8.0, 6.0, 8.0, 6.0],
74
+ "Content_Usefulness": [8.0, 6.0, 8.0, 7.0, 8.0, 9.0, 8.0, 6.0, 10.0, 7.0]
75
+ }
76
+ ```
77
+ 1. Recognize the dataset name from data_path. In the example, it is LibriTTS.
78
+ 2. Replace "/your_path/" with the path to your downloaded LibriTTS directory.
79
+ 3. Each axis contains 10 scores annotated by 10 different human annotators.
80
+
81
+ data_path | URL
82
+ |---|---|
83
+ LibriTTS | https://openslr.org/60/
84
+ cv-corpus-13.0-2023-03-09 | https://commonvoice.mozilla.org/en/datasets
85
+ EARS | https://sp-uhh.github.io/ears_dataset/
86
+ MUSDB18 | https://sigsep.github.io/datasets/musdb.html
87
+ musiccaps | https://www.kaggle.com/datasets/googleai/musiccaps
88
+ (audioset) unbalanced_train_segments | https://research.google.com/audioset/dataset/index.html
89
+ PAM | https://zenodo.org/records/10737388
90
+
91
+ ## License
92
+ The majority of audiobox-aesthetics is licensed under CC-BY 4.0, as found in the LICENSE file.
93
+ However, portions of the project are available under separate license terms: [https://github.com/microsoft/unilm](https://github.com/microsoft/unilm) is licensed under MIT license.
94
+
95
+ ## Citation
96
+ If you find this repository useful, please cite it using the following BibTeX entry.
97
+
98
+ ```
99
+ @article{tjandra2025aes,
100
+ title={Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound},
101
+ author={Tjandra, Andros and Wu, Yi-Chiao and Guo, Baishan and Hoffman, John and Ellis, Brian and Vyas, Apoorv and Shi, Bowen and Chen, Sanyuan and Le, Matt and Zacharov, Nick and Wood, Carleigh and Lee, Ann and Hsu, Wei-ning},
102
+ publisher={Meta AI},
103
+ year={2025},
104
+ url={https://ai.meta.com/research/publications/meta-audiobox-aesthetics-unified-automatic-quality-assessment-for-speech-music-and-sound/}
105
+ }
106
+ ```
107
+
108
+ ## Acknowledgements
109
+
110
+ Parts of the model code are copied from [WavLM](https://github.com/microsoft/unilm/tree/master/wavlm).
111
+
evaluation_data/AES_PAM.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_data/AES_natural_music.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_data/AES_natural_sound.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_data/AES_natural_speech.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "audiobox_aesthetics"
7
+ version = "0.0.1"
8
+ authors = [
9
+ {name="Andros Tjandra", email="[email protected]"},
10
+ {name="Yi-Chiao Wu"},
11
+ {name="Baishan Guo"},
12
+ {name="John Hoffman"},
13
+ {name="Brian Ellis"},
14
+ {name="Apoorv Vyas"},
15
+ {name="Bowen Shi"},
16
+ {name="Sanyuan Chen"},
17
+ {name="Matt Le"},
18
+ {name="Nick Zacharov"},
19
+ {name="Carleigh Wood"},
20
+ {name="Ann Lee"},
21
+ {name="Wei-ning Hsu"}
22
+ ]
23
+ maintainers = [
24
+ {name="Andros Tjandra", email="[email protected]"}
25
+ ]
26
+ description = "Unified automatic quality assessment for speech, music, and sound."
27
+ requires-python = ">=3.9"
28
+ classifiers = [
29
+ "Programming Language :: Python :: 3",
30
+ "Operating System :: OS Independent",
31
+ ]
32
+ readme = "README.md"
33
+ license = {file = "LICENSE"}
34
+
35
+ dependencies = [
36
+ "numpy",
37
+ "torch>=2.2.0",
38
+ "torchaudio",
39
+ "tqdm",
40
+ "iopath",
41
+ "submitit"
42
+ ]
43
+
44
+ [project.scripts]
45
+ audio-aes = "audiobox_aesthetics.cli:app"
46
+
47
+ [project.urls]
48
+ Homepage = "https://github.com/pypa/sampleproject"
49
+ Issues = "https://github.com/pypa/sampleproject/issues"
50
+
51
+
src/audiobox_aesthetics/__init__.py ADDED
File without changes
src/audiobox_aesthetics/cli.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ from functools import partial
9
+ import itertools
10
+ from pathlib import Path
11
+
12
+ import submitit
13
+ from .infer import load_dataset, main_predict
14
+
15
+
16
def parse_args(argv=None):
    """Parse the command-line arguments of the audiobox-aesthetics CLI.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what the console entry
            point relies on.

    Returns:
        The populated ``argparse.Namespace``.
    """
    # The text is a description, not the program name: passing it
    # positionally would make argparse use it as ``prog`` in usage output.
    parser = argparse.ArgumentParser(
        description="CLI for audiobox-aesthetics inference"
    )
    parser.add_argument("input_file", type=str)
    parser.add_argument("--ckpt", type=str, required=True)
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument(
        "--remote", action="store_true", default=False, help="Set true to run via SLURM"
    )

    # Options below are only consulted when --remote is given.
    parser.add_argument(
        "--job-dir", default="/tmp", type=str, help="Slurm job directory"
    )
    parser.add_argument(
        "--partition", default="learn", type=str, help="Slurm partition"
    )
    parser.add_argument("--qos", default="", type=str, help="Slurm QOS")
    parser.add_argument("--account", default="", type=str, help="Slurm account")
    parser.add_argument("--comment", default="", type=str, help="Slurm job comment")
    parser.add_argument(
        "--constraint",
        default="",
        type=str,
        help="Slurm constraint eg.: ampere80gb For using A100s or volta32gb for using V100s.",
    )
    parser.add_argument(
        "--exclude",
        default="",
        type=str,
        help="Exclude certain nodes from the slurm job.",
    )
    parser.add_argument(
        "--array", default=100, type=int, help="Slurm max array parallelism"
    )
    parser.add_argument(
        "--chunk", default=1000, type=int, help="chunk size per instance"
    )
    return parser.parse_args(argv)
54
+
55
+
56
def app():
    """Console entry point: score every item of the input file and print results.

    The input metadata is loaded once. Without ``--remote`` the prediction
    runs in this process; with ``--remote`` the metadata is split into chunks
    of ``--chunk`` items that are submitted as a submitit job array, and the
    per-chunk results are concatenated in submission order. One output entry
    is printed to stdout per line.
    """
    args = parse_args()

    metadata = load_dataset(args.input_file, 0, 2**64)
    fn_wrapped = partial(main_predict, batch_size=args.batch_size, ckpt=args.ckpt)

    if args.remote:
        # Split the metadata into fixed-size chunks, one per array task.
        chunksize = args.chunk
        chunked = [
            metadata[ii : ii + chunksize] for ii in range(0, len(metadata), chunksize)
        ]

        job_dir = Path(args.job_dir)
        # parents=True so a nested, not-yet-existing job directory works too.
        job_dir.mkdir(parents=True, exist_ok=True)

        executor = submitit.AutoExecutor(folder=f"{job_dir}/%A/")

        # Forward optional Slurm settings only when the user supplied a
        # non-empty value, so no empty directive is handed to the scheduler.
        optional = {
            "slurm_constraint": args.constraint,
            "slurm_comment": args.comment,
            "slurm_qos": args.qos,
            "slurm_account": args.account,
            "slurm_exclude": args.exclude,
        }
        kwargs = {key: value for key, value in optional.items() if value}

        # Set the parameters for the Slurm job
        executor.update_parameters(
            slurm_nodes=1,
            slurm_gpus_per_node=1,
            slurm_tasks_per_node=1,
            slurm_cpus_per_task=10,
            timeout_min=60 * 20,  # max is 20 hours
            # number of tasks in the array job that may run concurrently
            slurm_array_parallelism=min(len(chunked), args.array),
            slurm_partition=args.partition,
            **kwargs,
        )

        jobs = executor.map_array(fn_wrapped, chunked)
        # Block until every array task has finished, keeping chunk order.
        outputs = itertools.chain.from_iterable(job.result() for job in jobs)
    else:
        outputs = fn_wrapped(metadata)
    print("\n".join(str(x) for x in outputs))
106
+
107
+
108
+ if __name__ == "__main__":
109
+ """
110
+ Example usage:
111
+ python cli.py input.jsonl --batch-size 100 --ckpt /path/to/ckpt > output.jsonl
112
+ """
113
+ app()
src/audiobox_aesthetics/infer.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass
8
+ import json
9
+ import re
10
+ import sys
11
+ from typing import Any, Dict, List
12
+ from tqdm import tqdm
13
+ from iopath import PathManager
14
+ import torch
15
+ import torchaudio
16
+ import torch.nn.functional as F
17
+
18
+ from .model.aes_wavlm import Normalize, WavlmAudioEncoderMultiOutput
19
+
20
+ Batch = Dict[str, Any]
21
+
22
+
23
def read_wav(meta):
    """Load the audio described by ``meta`` and downmix it to one channel.

    Args:
        meta: Mapping with a ``"path"`` entry. When it also has
            ``"start_time"`` it must have ``"end_time"`` too; both are
            multiplied by the file's sample rate to select the segment to
            load (i.e. they are treated as seconds).

    Returns:
        ``(wav, sr)``: the waveform with a single leading channel and the
        sample rate of the file.
    """
    path = meta["path"]

    if "start_time" in meta:
        start = meta["start_time"]
        end = meta["end_time"]
        sr = torchaudio.info(path).sample_rate
        # torchaudio expects integer frame counts; fractional times would
        # otherwise produce floats here.
        frame_offset = int(round(start * sr))
        num_frames = int(round((end - start) * sr))
        wav, _ = torchaudio.load(
            path, frame_offset=frame_offset, num_frames=num_frames
        )
    else:
        wav, sr = torchaudio.load(path)

    # Average the channels of multi-channel audio into a single one.
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    return wav, sr
40
+
41
+
42
def make_inference_batch(
    input_wavs: list,
    hop_size=10,
    window_size=10,
    sample_rate=16000,
    pad_zero=True,
):
    """Cut each waveform into fixed-length windows for batched inference.

    Args:
        input_wavs: Waveform tensors whose last dimension is time.
        hop_size: Step between window starts; multiplied by ``sample_rate``
            to obtain a sample count. May be fractional.
        window_size: Window length; multiplied by ``sample_rate`` to obtain
            a sample count. May be fractional.
        sample_rate: Samples per unit of ``hop_size`` / ``window_size``.
        pad_zero: When true, a window shorter than the window length is
            right-padded with zeros.

    Returns:
        Four parallel lists ``(wavs, masks, weights, bids)``: the windows, a
        boolean mask per window that is True on real (non-padded) samples,
        the fraction of the window length covered by real samples, and the
        index of the source waveform in ``input_wavs``.
    """
    wavs = []
    masks = []
    weights = []
    bids = []
    # range() and slicing need integers even for fractional sizes.
    offset = int(hop_size * sample_rate)
    winlen = int(window_size * sample_rate)
    for bid, wav in enumerate(input_wavs):
        for ii in range(0, wav.shape[-1], offset):
            wav_ii = wav[..., ii : ii + winlen]
            wav_ii_len = wav_ii.shape[-1]
            if wav_ii_len < winlen and pad_zero:
                wav_ii = F.pad(wav_ii, (0, winlen - wav_ii_len))
            mask_ii = torch.zeros_like(wav_ii, dtype=torch.bool)
            # Ellipsis indexing (as used for the slice above) keeps this
            # working for waveforms of any rank, not only 2-D ones.
            mask_ii[..., :wav_ii_len] = True
            wavs.append(wav_ii)
            masks.append(mask_ii)
            weights.append(wav_ii_len / winlen)
            bids.append(bid)
    return wavs, masks, weights, bids
68
+
69
+
70
+ AXES_NAME = ["CE", "CU", "PC", "PQ"]
71
+
72
+
73
@dataclass
class AesWavlmPredictorMultiOutput:
    """Inference wrapper that scores audio on every axis in ``AXES_NAME``.

    Call ``setup_model()`` once, then ``forward(batch)`` for each batch of
    metadata items.
    """

    # Location of the checkpoint, opened through iopath's PathManager.
    checkpoint_pth: str
    # Only used to fill ``self.dtype`` in setup_model(); that attribute is
    # not read by any method of this class.
    precision: str = "bf16"
    # Stored for callers; not read by any method of this class.
    batch_size: int = 1
    # Key of each item holding either a file path (str) or a waveform.
    data_col: str = "path"
    sample_rate: int = 16000  # const

    def setup_model(self):
        """Load the checkpoint and build the model and the target transforms."""
        # This method gets called before inference starts
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.path_manager = PathManager()
        print(f"Setting up Aesthetic model on {self.device}", file=sys.stderr)

        # NOTE(review): torch.load may unpickle arbitrary objects; only load
        # checkpoints that come from a trusted source.
        with self.path_manager.open(self.checkpoint_pth, "rb") as fin:
            ckpt = torch.load(fin, map_location=self.device)
        # Strip a leading "model." from parameter names. The dot is escaped
        # so that only the literal prefix matches (not e.g. "model_x").
        state_dict = {
            re.sub(r"^model\.", "", k): v for (k, v) in ckpt["state_dict"].items()
        }
        model = WavlmAudioEncoderMultiOutput(
            **{
                k: ckpt["model_cfg"][k]
                for k in [
                    "proj_num_layer",
                    "proj_ln",
                    "proj_act_fn",
                    "proj_dropout",
                    "nth_layer",
                    "use_weighted_layer_sum",
                    "precision",
                    "normalize_embed",
                    "output_dim",
                ]
            }
        )
        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()

        self.model = model
        # None for any precision other than "16" / "bf16".
        self.dtype = {
            "16": torch.float16,
            "bf16": torch.bfloat16,
        }.get(self.precision)

        # Per-axis statistics used to map model outputs back to score scale.
        self.target_transform = {
            axis: Normalize(
                mean=ckpt["target_transform"][axis]["mean"],
                std=ckpt["target_transform"][axis]["std"],
            )
            for axis in AXES_NAME
        }

    def audio_resample_mono(self, data_list: List[Batch]) -> List:
        """Return one mono waveform at ``self.sample_rate`` per input item.

        An item whose ``data_col`` entry is a string is read from disk with
        ``read_wav``; otherwise the entry is used as the waveform and the
        item's ``"sample_rate"`` entry as its sample rate.
        """
        wavs = []
        for item in data_list:
            if isinstance(item[self.data_col], str):
                wav, sr = read_wav(item)
            else:
                wav = item[self.data_col]
                sr = item["sample_rate"]

            wav = torchaudio.functional.resample(
                wav,
                orig_freq=sr,
                new_freq=self.sample_rate,
            )
            wav = wav.mean(dim=0, keepdim=True)
            wavs.append(wav)
        return wavs

    def forward(self, batch):
        """Score a batch of items.

        Args:
            batch: List of metadata items accepted by ``audio_resample_mono``.

        Returns:
            A list with one JSON string per input item, each mapping every
            axis name to its predicted score.
        """
        with torch.inference_mode():
            bsz = len(batch)
            wavs = self.audio_resample_mono(batch)
            # Split every waveform into windows (size and hop both 10).
            wavs, masks, weights, bids = make_inference_batch(
                wavs,
                10,
                10,
                sample_rate=self.sample_rate,
            )

            # collate
            wavs = torch.stack(wavs).to(self.device)
            masks = torch.stack(masks).to(self.device)
            weights = torch.tensor(weights).to(self.device)
            bids = torch.tensor(bids).to(self.device)

            assert wavs.shape[0] == masks.shape[0] == weights.shape[0] == bids.shape[0]
            preds_all = self.model({"wav": wavs, "mask": masks})
            all_result = {}
            for axis in AXES_NAME:
                preds = self.target_transform[axis].inverse(preds_all[axis])
                weighted_preds = []
                for bii in range(bsz):
                    # Weighted mean of the window predictions of item bii;
                    # the weight is the filled fraction of each window.
                    weights_bii = weights[bids == bii]
                    weighted_preds.append(
                        (
                            (preds[bids == bii] * weights_bii).sum() / weights_bii.sum()
                        ).item()
                    )
                all_result[axis] = weighted_preds
            # re-arrange result: dict of per-axis lists -> list of per-item dicts
            all_rows = [
                dict(zip(all_result.keys(), vv)) for vv in zip(*all_result.values())
            ]
            # convert to json str
            all_rows = [json.dumps(x) for x in all_rows]
            return all_rows
183
+
184
+
185
def load_dataset(path, start=None, end=None) -> List[Dict[str, Any]]:
    """Read a JSON-lines file and return the parsed records.

    Args:
        path: Path of the JSON-lines file.
        start: Index of the first line to keep (0-based, inclusive).
            ``None`` means "from the first line".
        end: Index one past the last line to keep (exclusive). ``None``
            means "through the end of the file".

    Returns:
        A list with one parsed object per selected non-blank line.
    """
    first = 0 if start is None else start
    metadata = []
    with open(path, encoding="utf-8") as fr:
        for ii, line in enumerate(fr):
            if end is not None and ii >= end:
                # Nothing after ``end`` can be selected; stop reading.
                break
            if ii < first:
                continue
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            metadata.append(json.loads(line))
    return metadata
193
+
194
+
195
def main_predict(input_file, ckpt, batch_size=10):
    """Run the aesthetics predictor over a dataset and collect its outputs.

    Args:
        input_file: Either the path (str) of a JSON-lines metadata file or
            an already loaded sequence of metadata items.
        ckpt: Checkpoint location handed to the predictor.
        batch_size: Number of items passed to each ``forward`` call.

    Returns:
        The concatenated list of per-item outputs, in input order.
    """
    predictor = AesWavlmPredictorMultiOutput(checkpoint_pth=ckpt, data_col="path")
    predictor.setup_model()

    # A string is treated as a file to load; anything else is used directly.
    metadata = (
        load_dataset(input_file, 0, 2**64)
        if isinstance(input_file, str)
        else input_file
    )

    outputs = []
    for begin in tqdm(range(0, len(metadata), batch_size)):
        current_batch = metadata[begin : begin + batch_size]
        outputs.extend(predictor.forward(current_batch))

    assert len(outputs) == len(
        metadata
    ), f"Output {len(outputs)} != input {len(metadata)} length"
    return outputs
src/audiobox_aesthetics/model/__init__.py ADDED
File without changes
src/audiobox_aesthetics/model/aes_wavlm.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass
8
+ import sys
9
+ from torch import nn
10
+ import torch
11
+
12
+ from .utils import create_mlp_block
13
+ from .wavlm import WavLM, WavLMConfig
14
+
15
+
16
+ DEFAULT_AUDIO_CFG = WavLMConfig(
17
+ {
18
+ "extractor_mode": "default",
19
+ "encoder_layers": 12,
20
+ "encoder_embed_dim": 768,
21
+ "encoder_ffn_embed_dim": 3072,
22
+ "encoder_attention_heads": 12,
23
+ "activation_fn": "gelu",
24
+ "dropout": 0.1,
25
+ "attention_dropout": 0.1,
26
+ "activation_dropout": 0.0,
27
+ "encoder_layerdrop": 0.05,
28
+ "dropout_input": 0.1,
29
+ "dropout_features": 0.1,
30
+ "layer_norm_first": False,
31
+ "conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
32
+ "conv_bias": False,
33
+ "feature_grad_mult": 0.1,
34
+ "mask_length": 10,
35
+ "mask_prob": 0.8,
36
+ "mask_selection": "static",
37
+ "mask_other": 0.0,
38
+ "no_mask_overlap": False,
39
+ "mask_min_space": 1,
40
+ "mask_channel_length": 10,
41
+ "mask_channel_prob": 0.0,
42
+ "mask_channel_selection": "static",
43
+ "mask_channel_other": 0.0,
44
+ "no_mask_channel_overlap": False,
45
+ "mask_channel_min_space": 1,
46
+ "conv_pos": 128,
47
+ "conv_pos_groups": 16,
48
+ "relative_position_embedding": True,
49
+ "num_buckets": 320,
50
+ "max_distance": 800,
51
+ "gru_rel_pos": True,
52
+ "normalize": False,
53
+ }
54
+ )
55
+
56
+
57
@dataclass(eq=False)
class Normalize:
    """Affine normalisation defined by a mean and a standard deviation."""

    mean: float
    std: float

    def transform(self, x):
        """Map ``x`` to normalised space: subtract the mean, divide by std."""
        centered = x - self.mean
        return centered / self.std

    def inverse(self, x):
        """Undo ``transform``: multiply by std, then add the mean."""
        rescaled = x * self.std
        return rescaled + self.mean
67
+
68
+
69
+ AXES_NAME = ["CE", "CU", "PC", "PQ"]
70
+
71
+
72
@dataclass(eq=False)
class WavlmAudioEncoderMultiOutput(nn.Module):
    """WavLM encoder with one projection head per axis in ``AXES_NAME``.

    The fields configure the projection heads (``proj_*``, ``output_dim``),
    which encoder layers are used (``nth_layer``,
    ``use_weighted_layer_sum``), the autocast precision (``precision``: one
    of "64", "32", "16", "bf16") and whether the pooled embedding is
    L2-normalised (``normalize_embed``).
    """

    proj_num_layer: int = 1
    proj_ln: bool = False
    proj_act_fn: str = "gelu"
    proj_dropout: float = 0
    nth_layer: int = 13
    use_weighted_layer_sum: bool = True
    precision: str = "32"
    normalize_embed: bool = True
    output_dim: int = 1

    def __post_init__(self):
        super().__init__()
        amodel_cfg = DEFAULT_AUDIO_CFG
        self.wavlm_model = WavLM(amodel_cfg)
        wavlm_out_dim = self.wavlm_model.cfg.encoder_embed_dim

        self.axes_name = AXES_NAME
        # One MLP head per axis.
        self.proj_layer = nn.ModuleDict(
            {
                x: nn.Sequential(
                    *create_mlp_block(
                        wavlm_out_dim,
                        self.output_dim,
                        self.proj_num_layer,
                        self.proj_act_fn,
                        self.proj_ln,
                        dropout=self.proj_dropout,
                    )
                )
                for x in self.axes_name
            }
        )
        if self.use_weighted_layer_sum:
            # One learnable weight vector over the layers per axis,
            # initialised uniformly.
            self.layer_weights = nn.ParameterDict(
                {
                    x: torch.nn.Parameter(torch.ones(self.nth_layer) / (self.nth_layer))
                    for x in self.axes_name
                }
            )

        precision_map = {
            "64": torch.float64,
            "32": torch.float32,
            "16": torch.half,
            "bf16": torch.bfloat16,
        }
        # Decide on autocast from the configured key *before* the attribute
        # is replaced by a torch.dtype; str(torch.bfloat16) is
        # "torch.bfloat16", which would never match "16" / "bf16".
        precision_key = str(self.precision)
        self.precision = precision_map[precision_key]
        self.enable_autocast = precision_key in {"16", "bf16"}
        print(
            f"precision: {self.precision}, enable autocast: {self.enable_autocast}",
            file=sys.stderr,
        )

    def forward(self, batch):
        """Predict one score tensor per axis.

        Args:
            batch: Dict with ``"wav"`` (a 3-D tensor, annotated ``[B, C, T]``
                in the original code; dimension 1 is squeezed away) and
                optionally ``"mask"`` of matching layout, True on valid
                samples.

        Returns:
            Dict mapping each axis name to the output of its projection head
            with the last dimension squeezed.
        """
        assert batch["wav"].ndim == 3

        # frames: [B, C, T]
        wav = batch["wav"].squeeze(1)

        if "mask" in batch:
            # WavLM expects True on padded positions, i.e. the inverse.
            padding_mask = ~batch["mask"].squeeze(1)
        else:
            padding_mask = torch.zeros_like(wav, dtype=torch.bool)

        autocast_ctx = torch.amp.autocast(
            device_type=wav.device.type,
            dtype=self.precision,
            enabled=self.enable_autocast,
        )
        # The WavLM backbone always runs without gradient tracking.
        with autocast_ctx, torch.no_grad():
            if self.wavlm_model.cfg.normalize:
                wav = torch.nn.functional.layer_norm(wav, wav.shape)
            (_, all_outputs), embed_padding_mask = self.wavlm_model.extract_features(
                source=wav,
                padding_mask=padding_mask,
                output_layer=self.nth_layer,
                ret_layer_results=True,
            )
        # Stack the per-layer outputs on a new last axis: T B C L.
        # .float() is a no-op for float32 and keeps the heads (which run
        # outside autocast) in full precision if a layer returned half/bf16.
        all_outputs = torch.stack([gg[0] for gg in all_outputs], dim=-1).float()
        preds = {}
        for name in self.axes_name:
            if self.use_weighted_layer_sum:
                norm_weights = torch.nn.functional.softmax(
                    self.layer_weights[name], dim=-1
                )  # L
                audio_embed = torch.einsum("tbcl,l->btc", all_outputs, norm_weights)
            else:
                # all_outputs is a stacked tensor here, so take the last
                # layer along the layer axis and move batch first: B T C.
                audio_embed = all_outputs[..., -1].transpose(1, 0)

            # Mean-pool over time, counting only non-padded frames.
            embed_mask = (
                (~embed_padding_mask).unsqueeze(dim=-1).type_as(audio_embed)
            )
            audio_embed = (audio_embed * embed_mask).sum(dim=1) / embed_mask.sum(
                dim=1
            ).clamp(min=1)
            if self.normalize_embed:
                audio_embed = torch.nn.functional.normalize(audio_embed, dim=-1)

            preds[name] = self.proj_layer[name](audio_embed).squeeze(-1)
        return preds
src/audiobox_aesthetics/model/utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn
8
+
9
+
10
def create_mlp_block(input_dim, output_dim, num_layer, act_fn, layer_norm, dropout=0):
    """Build the layers of an MLP as a flat list of modules.

    The first ``num_layer - 1`` stages are ``Linear(input_dim, input_dim)``
    followed by an optional ``LayerNorm``, the activation and an optional
    ``Dropout``; the final stage is a bare ``Linear(input_dim, output_dim)``.

    Args:
        input_dim: Width of the input and of every hidden stage.
        output_dim: Width of the final linear layer.
        num_layer: Total number of linear layers.
        act_fn: Activation name; only ``"gelu"`` is implemented. It is only
            consulted when there is at least one hidden stage.
        layer_norm: Whether to add ``LayerNorm`` after each hidden linear.
        dropout: Dropout probability for hidden stages; 0 disables it.

    Returns:
        List of ``nn.Module`` objects, ready for ``nn.Sequential(*layers)``.

    Raises:
        ValueError: If a hidden stage is required and ``act_fn`` is not
            ``"gelu"``.
    """
    # Validate before allocating any layer; hidden stages exist only when
    # num_layer > 1, which matches when the activation is actually needed.
    if num_layer > 1 and act_fn != "gelu":
        raise ValueError(
            f"Unsupported act_fn {act_fn!r}: only 'gelu' is implemented"
        )

    proj_layer = []
    for ii in range(num_layer):
        if ii == num_layer - 1:
            proj_layer.append(nn.Linear(input_dim, output_dim))
            continue
        proj_layer.append(nn.Linear(input_dim, input_dim))
        if layer_norm:
            proj_layer.append(nn.LayerNorm(normalized_shape=input_dim))
        proj_layer.append(nn.GELU())
        if dropout != 0:
            proj_layer.append(nn.Dropout(p=dropout))
    return proj_layer
src/audiobox_aesthetics/model/wavlm.py ADDED
@@ -0,0 +1,1597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Based on WavLM code
4
+ # --------------------------------------------------------
5
+ # WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing (https://arxiv.org/abs/2110.13900.pdf)
6
+ # Github source: https://github.com/microsoft/unilm/tree/master/wavlm
7
+ # Copyright (c) 2021 Microsoft
8
+ # Licensed under The MIT License [see LICENSE for details]
9
+ # Based on fairseq code bases
10
+ # https://github.com/pytorch/fairseq
11
+ # --------------------------------------------------------
12
+
13
+ import logging
14
+ import math
15
+
16
+ from typing import Dict, List, Optional, Tuple
17
+
18
+ import numpy as np
19
+
20
+ import warnings
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from torch import Tensor
25
+ from torch.nn import LayerNorm, Parameter
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class TransposeLast(nn.Module):
    """Swap the last two dimensions of the input.

    When ``deconstruct_idx`` is given, the input is first indexed with it
    (e.g. to pick one element out of a tuple) before transposing.
    """

    def __init__(self, deconstruct_idx=None):
        super().__init__()
        self.deconstruct_idx = deconstruct_idx

    def forward(self, x):
        idx = self.deconstruct_idx
        picked = x if idx is None else x[idx]
        return picked.transpose(-2, -1)
40
+
41
+
42
class Fp32LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the input's dtype."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        # Affine parameters may be absent; cast them only when present.
        weight = None if self.weight is None else self.weight.float()
        bias = None if self.bias is None else self.bias.float()
        normed = F.layer_norm(
            input.float(), self.normalized_shape, weight, bias, self.eps
        )
        return normed.type_as(input)
55
+
56
+
57
class Fp32GroupNorm(nn.GroupNorm):
    """GroupNorm that computes in float32 and returns the input's dtype."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        # Affine parameters may be absent; cast them only when present.
        weight = None if self.weight is None else self.weight.float()
        bias = None if self.bias is None else self.bias.float()
        normed = F.group_norm(input.float(), self.num_groups, weight, bias, self.eps)
        return normed.type_as(input)
70
+
71
+
72
class GradMultiply(torch.autograd.Function):
    """Pass the input through unchanged and scale its gradient by ``scale``."""

    @staticmethod
    def forward(ctx, x, scale):
        # Remember the factor for the backward pass.
        ctx.scale = scale
        return x.new(x)

    @staticmethod
    def backward(ctx, grad):
        scaled_grad = grad * ctx.scale
        # ``scale`` is not differentiable, hence the trailing None.
        return scaled_grad, None
82
+
83
+
84
class SamePad(nn.Module):
    """Trim trailing positions from the last axis of a 3-D tensor.

    ``remove`` is ``kernel_size - 1`` when ``causal`` is set; otherwise it is
    1 for an even ``kernel_size`` and 0 for an odd one.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        if causal:
            self.remove = kernel_size - 1
        else:
            self.remove = int(kernel_size % 2 == 0)

    def forward(self, x):
        if self.remove <= 0:
            return x
        return x[:, :, : -self.remove]
96
+
97
+
98
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        """Create the sigmoid module used as the gate."""
        super().__init__()
        self.act = torch.nn.Sigmoid()

    def forward(self, x):
        gate = self.act(x)
        return x * gate
108
+
109
+
110
class GLU_Linear(nn.Module):
    """Linear layer followed by a gated linear unit.

    The linear layer produces ``2 * output_dim`` features along the last
    axis; the first half is multiplied by the activated second half (or by
    the raw second half for ``glu_type="bilinear"``).

    Args:
        input_dim: Size of the last input dimension.
        output_dim: Size of the last output dimension.
        glu_type: Gate activation: "sigmoid", "swish", "relu", "gelu" or
            "bilinear".
        bias_in_glu: Whether the linear layer has a bias.

    Raises:
        ValueError: If ``glu_type`` is not one of the supported names.
    """

    def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
        super(GLU_Linear, self).__init__()

        self.glu_type = glu_type
        self.output_dim = output_dim

        if glu_type == "sigmoid":
            self.glu_act = torch.nn.Sigmoid()
        elif glu_type == "swish":
            self.glu_act = Swish()
        elif glu_type == "relu":
            self.glu_act = torch.nn.ReLU()
        elif glu_type == "gelu":
            self.glu_act = torch.nn.GELU()
        elif glu_type != "bilinear":
            # Fail at construction instead of with an AttributeError on the
            # first forward pass.
            raise ValueError(f"Unsupported glu_type: {glu_type!r}")

        self.linear = nn.Linear(input_dim, output_dim * 2, bool(bias_in_glu))

    def forward(self, x):
        # The channel dimension is assumed to be the last one, so callers
        # coming from a 1D-Conv layout must transpose first.
        x = self.linear(x)

        # Ellipsis indexing supports any number of leading dimensions.
        value = x[..., 0 : self.output_dim]
        gate = x[..., self.output_dim : self.output_dim * 2]
        if self.glu_type == "bilinear":
            return value * gate
        return value * self.glu_act(gate)
146
+
147
+
148
def gelu_accurate(x):
    """Tanh-based GELU approximation.

    The constant ``sqrt(2 / pi)`` is computed on first use and cached as the
    function attribute ``gelu_accurate._a``.
    """
    try:
        coeff = gelu_accurate._a
    except AttributeError:
        coeff = gelu_accurate._a = math.sqrt(2 / math.pi)
    inner = coeff * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))
154
+
155
+
156
def gelu(x: torch.Tensor) -> torch.Tensor:
    """Apply GELU in float32 and cast the result back to ``x``'s dtype."""
    activated = torch.nn.functional.gelu(x.float())
    return activated.type_as(x)
158
+
159
+
160
def get_activation_fn(activation: str):
    """Return the callable implementing the activation named ``activation``.

    "linear" and "glu" both map to an identity function. "gelu_fast" is a
    deprecated alias of "gelu_accurate" and emits a warning.

    Raises:
        RuntimeError: If the name is not recognised.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return gelu
    if activation == "gelu_fast":
        warnings.warn("--activation-fn=gelu_fast has been renamed to gelu_accurate")
        return gelu_accurate
    if activation == "gelu_accurate":
        return gelu_accurate
    if activation == "tanh":
        return torch.tanh
    if activation in ("linear", "glu"):
        return lambda x: x
    raise RuntimeError("--activation-fn {} not supported".format(activation))
180
+
181
+
182
def init_bert_params(module):
    """Apply BERT-style initialisation to ``module`` in place.

    - ``nn.Linear``: weight drawn from N(0, 0.02); bias (if any) zeroed.
    - ``nn.Embedding``: weight drawn from N(0, 0.02); the row at
      ``padding_idx`` (if any) zeroed.
    - ``MultiheadAttention``: the weights of ``q_proj``, ``k_proj`` and
      ``v_proj`` drawn from N(0, 0.02).

    Modules of any other type are left untouched.
    """

    def normal_(data):
        # with FSDP, module params will be on CUDA, so we cast them back to CPU
        # so that the RNG is consistent with and without FSDP
        sampled = data.cpu().normal_(mean=0.0, std=0.02)
        data.copy_(sampled.to(data.device))

    if isinstance(module, nn.Linear):
        normal_(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        normal_(module.weight.data)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    if isinstance(module, MultiheadAttention):
        # Order (q, k, v) is kept so the RNG stream is consumed identically.
        for projection in (module.q_proj, module.k_proj, module.v_proj):
            normal_(projection.weight.data)
213
+
214
+
215
def quant_noise(module, p, block_size):
    """
    Wrap ``module`` so that quantization noise is applied to its weights
    during training, for subsequent quantization with Iterative Product
    Quantization as described in "Training with Quantization Noise for
    Extreme Model Compression".

    Args:
        - module: nn.Module
        - p: amount of Quantization Noise
        - block_size: size of the blocks for subsequent quantization with iPQ

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - Only Linear, Embedding and Conv2d modules are supported for the moment
        - For more detail on how to quantize by blocks with convolutional weights,
          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
        - The simplest form of noise from the paper is implemented:
          randomly dropping blocks
        - The same module object is returned; with ``p <= 0`` no hook is
          registered at all
    """

    # if no quantization noise, don't register hook
    if p <= 0:
        return module

    # supported modules
    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # a 4D weight means a convolution; anything else is a 2D matrix
    is_conv = module.weight.ndim == 4

    # test whether module.weight has the right sizes wrt block_size
    if not is_conv:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"
    elif module.kernel_size == (1, 1):
        # 1x1 convolutions
        assert (
            module.in_channels % block_size == 0
        ), "Input channels must be a multiple of block sizes"
    else:
        # regular convolutions
        k = module.kernel_size[0] * module.kernel_size[1]
        assert k % block_size == 0, "Kernel size must be a multiple of block size"

    def _forward_pre_hook(mod, input):
        # no noise for evaluation
        if not mod.training:
            return

        weight = mod.weight
        # split weight matrix into blocks and randomly drop selected blocks
        if not is_conv:
            in_features = weight.size(1)
            out_features = weight.size(0)
            mask = torch.zeros(
                in_features // block_size * out_features, device=weight.device
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
        elif mod.kernel_size == (1, 1):
            in_channels = mod.in_channels
            out_channels = mod.out_channels
            mask = torch.zeros(
                int(in_channels // block_size * out_channels),
                device=weight.device,
            )
            mask.bernoulli_(p)
            mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
        else:
            mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
            mask.bernoulli_(p)
            mask = (
                mask.unsqueeze(2)
                .unsqueeze(3)
                .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
            )

        # scale weights and apply mask
        # x.bool() is not currently supported in TorchScript
        mask = mask.to(torch.bool)
        s = 1 / (1 - p)
        mod.weight.data = s * weight.masked_fill(mask, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module
313
+
314
+
315
class MultiheadAttention(nn.Module):
    """Multi-headed attention with an optional relative position bias.

    See "Attention Is All You Need" for more details.

    In addition to scaled dot-product attention, the module can add a learned,
    bucketed relative position bias to the attention logits
    (``has_relative_attention_bias``) and rescale that bias with a
    query-dependent gate (``gru_rel_pos``).
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        has_relative_attention_bias=False,
        num_buckets=32,
        max_distance=128,
        gru_rel_pos=False,
        rescale_init=False,
    ):
        """Create the projections and the optional relative-position parameters.

        Args:
            embed_dim: feature size of the query and of the output.
            num_heads: number of attention heads; must divide ``embed_dim``.
            kdim: feature size of ``key`` (defaults to ``embed_dim``).
            vdim: feature size of ``value`` (defaults to ``embed_dim``).
            dropout: dropout probability applied to the attention weights.
            bias: whether the q/v/out projections have a bias term.
            add_bias_kv: learn one extra key/value position (``bias_k`` /
                ``bias_v``) that is appended to the projected keys/values.
            add_zero_attn: append an all-zero key/value position.
            self_attention: project q, k and v from ``query``.
            encoder_decoder_attention: project both k and v from ``key``.
            q_noise: amount passed through to ``quant_noise``.
            qn_block_size: block size passed through to ``quant_noise``.
            has_relative_attention_bias: create the bucket embedding used by
                :meth:`compute_bias`.
            num_buckets: number of relative position buckets.
            max_distance: distance used to scale the logarithmic buckets.
            gru_rel_pos: gate the position bias with the query.
            rescale_init: when true the key projection is created without a
                bias term.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = nn.Dropout(dropout)

        self.has_relative_attention_bias = has_relative_attention_bias
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

        self.head_dim = embed_dim // num_heads
        self.q_head_dim = self.head_dim
        self.k_head_dim = self.head_dim
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and " "value to be of the same size"
        )

        # With rescale_init the key projection carries no bias.
        k_bias = not rescale_init

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, embed_dim, bias=k_bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.q_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        self.out_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        if add_bias_kv:
            # Uninitialised storage; values are set in reset_parameters().
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.gru_rel_pos = gru_rel_pos
        if self.gru_rel_pos:
            self.grep_linear = nn.Linear(self.q_head_dim, 8)
            self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise the projection weights, extra k/v biases and bucket embedding."""
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            gain = 1 / math.sqrt(2)
            nn.init.xavier_uniform_(self.k_proj.weight, gain=gain)
            nn.init.xavier_uniform_(self.v_proj.weight, gain=gain)
            nn.init.xavier_uniform_(self.q_proj.weight, gain=gain)
        else:
            nn.init.xavier_uniform_(self.k_proj.weight)
            nn.init.xavier_uniform_(self.v_proj.weight)
            nn.init.xavier_uniform_(self.q_proj.weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            nn.init.constant_(self.out_proj.bias, 0.0)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)
        if self.has_relative_attention_bias:
            nn.init.xavier_normal_(self.relative_attention_bias.weight)

    def _relative_positions_bucket(self, relative_positions, bidirectional=True):
        """Map signed relative positions to bucket indices.

        Distances below ``max_exact`` get one bucket each; larger distances
        share logarithmically sized buckets, capped at the last bucket. With
        ``bidirectional`` half of the buckets are used for positive offsets.
        """
        num_buckets = self.num_buckets
        max_distance = self.max_distance
        relative_buckets = 0

        if bidirectional:
            num_buckets = num_buckets // 2
            relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
            relative_positions = torch.abs(relative_positions)
        else:
            relative_positions = -torch.min(
                relative_positions, torch.zeros_like(relative_positions)
            )

        max_exact = num_buckets // 2
        is_small = relative_positions < max_exact

        # Only meaningful where is_small is False; torch.where below discards
        # the values computed for small distances (including log(0)).
        relative_position_if_large = max_exact + (
            torch.log(relative_positions.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large,
            torch.full_like(relative_position_if_large, num_buckets - 1),
        )

        relative_buckets += torch.where(
            is_small, relative_positions, relative_position_if_large
        )
        return relative_buckets

    def compute_bias(self, query_length, key_length):
        """Return the relative position bias as ``(num_heads, query_length, key_length)``."""
        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
        relative_position = memory_position - context_position
        relative_position_bucket = self._relative_positions_bucket(
            relative_position, bidirectional=True
        )
        relative_position_bucket = relative_position_bucket.to(
            self.relative_attention_bias.weight.device
        )
        values = self.relative_attention_bias(relative_position_bucket)
        values = values.permute([2, 0, 1])
        return values

    def _gated_position_bias(
        self, query_layer: Tensor, position_bias: Tensor, bsz: int
    ) -> Tensor:
        """Rescale ``position_bias`` with the query-dependent gate.

        ``query_layer`` is unpacked as ``(batch, heads, length, head_dim)``; the
        result is ``position_bias`` multiplied by a per-query-position factor.
        """
        _B, _H, _L, __ = query_layer.size()
        gate_a, gate_b = torch.sigmoid(
            self.grep_linear(query_layer)
            .view(_B, _H, _L, 2, 4)
            .sum(-1, keepdim=False)
        ).chunk(2, dim=-1)
        gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
        return gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias

    def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        need_weights: bool = True,
        static_kv: bool = False,
        attn_mask: Optional[Tensor] = None,
        before_softmax: bool = False,
        need_head_weights: bool = False,
        position_bias: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.
            position_bias (Tensor, optional): precomputed relative position
                bias; when omitted and the module owns a bucket embedding it
                is computed here.

        Returns:
            ``(output, attention weights or None, position_bias)``. With
            ``before_softmax`` the first two items are the raw attention
            logits and the projected values instead.
        """
        if need_head_weights:
            need_weights = True

        is_tpu = query.device.type == "xla"

        tgt_len, bsz, embed_dim = query.size()
        src_len = tgt_len
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        if key is not None:
            src_len, key_bsz, _ = key.size()
            if not torch.jit.is_scripting():
                assert key_bsz == bsz
                assert value is not None
                # Compare as a tuple; ``assert a, b == c`` would only test ``a``.
                assert (src_len, bsz) == tuple(value.shape[:2])

        if self.has_relative_attention_bias and position_bias is None:
            position_bias = self.compute_bias(tgt_len, src_len)
            position_bias = (
                position_bias.unsqueeze(0)
                .repeat(bsz, 1, 1, 1)
                .view(bsz * self.num_heads, tgt_len, src_len)
            )

        if (
            not is_tpu  # don't use PyTorch version on TPUs
            and incremental_state is None
            and not static_kv
            # A workaround for quantization to work. Otherwise JIT compilation
            # treats bias in linear module as method.
            and not torch.jit.is_scripting()
            and self.q_head_dim == self.head_dim
        ):
            assert key is not None and value is not None
            assert attn_mask is None

            attn_mask_rel_pos = None
            if position_bias is not None:
                attn_mask_rel_pos = position_bias
                if self.gru_rel_pos:
                    # (T, B, C) -> (B, H, T, head_dim); the gate sees the
                    # unprojected query on this path.
                    query_layer = query.transpose(0, 1)
                    new_x_shape = query_layer.size()[:-1] + (self.num_heads, -1)
                    query_layer = query_layer.view(*new_x_shape)
                    query_layer = query_layer.permute(0, 2, 1, 3)
                    attn_mask_rel_pos = self._gated_position_bias(
                        query_layer, position_bias, bsz
                    )

                # The bias is added to (tgt_len, src_len) attention logits.
                attn_mask_rel_pos = attn_mask_rel_pos.view((-1, tgt_len, src_len))

            # The key projection may have been built without a bias
            # (rescale_init); substitute zeros so the packed bias stays valid.
            k_proj_bias = self.k_proj.bias
            if k_proj_bias is None:
                k_proj_bias = torch.zeros_like(self.q_proj.bias)

            x, attn = F.multi_head_attention_forward(
                query,
                key,
                value,
                self.embed_dim,
                self.num_heads,
                torch.empty([0]),
                torch.cat((self.q_proj.bias, k_proj_bias, self.v_proj.bias)),
                self.bias_k,
                self.bias_v,
                self.add_zero_attn,
                self.dropout_module.p,
                self.out_proj.weight,
                self.out_proj.bias,
                self.training,
                key_padding_mask,
                need_weights,
                attn_mask_rel_pos,
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj.weight,
                k_proj_weight=self.k_proj.weight,
                v_proj_weight=self.v_proj.weight,
            )
            return x, attn, position_bias

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                    ],
                    dim=1,
                )

        # (len, bsz, embed) -> (bsz * heads, len, head_dim)
        q = (
            q.contiguous()
            .view(tgt_len, bsz * self.num_heads, self.q_head_dim)
            .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                .view(-1, bsz * self.num_heads, self.k_head_dim)
                .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
            )

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
                src_len = k.size(1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
                key_padding_mask=key_padding_mask,
                prev_key_padding_mask=prev_key_padding_mask,
                batch_size=bsz,
                src_len=k.size(1),
                static_kv=static_kv,
            )

            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state, saved_state)
        assert k is not None
        assert k.size(1) == src_len

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            assert v is not None
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        torch.zeros(key_padding_mask.size(0), 1).type_as(
                            key_padding_mask
                        ),
                    ],
                    dim=1,
                )

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            if not is_tpu:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                    float("-inf"),
                )
            else:
                attn_weights = attn_weights.transpose(0, 2)
                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
                attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if before_softmax:
            return attn_weights, v, position_bias

        if position_bias is not None:
            if self.gru_rel_pos:
                # On this path the gate sees the projected, scaled query.
                query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
                position_bias = self._gated_position_bias(
                    query_layer, position_bias, bsz
                )

            position_bias = position_bias.view(attn_weights.size())

            attn_weights = attn_weights + position_bias

        attn_weights_float = F.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
            ).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights, position_bias

    @staticmethod
    def _append_prev_key_padding_mask(
        key_padding_mask: Optional[Tensor],
        prev_key_padding_mask: Optional[Tensor],
        batch_size: int,
        src_len: int,
        static_kv: bool,
    ) -> Optional[Tensor]:
        """Combine the cached and the current key padding mask.

        Returns ``None`` when neither mask is given. When only one of them is
        given it is zero-filled up to ``src_len`` columns (the cached mask is
        padded on the right, the current mask on the left).
        """
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None and static_kv:
            new_key_padding_mask = prev_key_padding_mask
        elif prev_key_padding_mask is not None and key_padding_mask is not None:
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
            )
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current
        # is None
        elif prev_key_padding_mask is not None:
            if src_len > prev_key_padding_mask.size(1):
                filler = torch.zeros(
                    (batch_size, src_len - prev_key_padding_mask.size(1)),
                    device=prev_key_padding_mask.device,
                )
                new_key_padding_mask = torch.cat(
                    [prev_key_padding_mask.float(), filler.float()], dim=1
                )
            else:
                new_key_padding_mask = prev_key_padding_mask.float()
        elif key_padding_mask is not None:
            if src_len > key_padding_mask.size(1):
                filler = torch.zeros(
                    (batch_size, src_len - key_padding_mask.size(1)),
                    device=key_padding_mask.device,
                )
                new_key_padding_mask = torch.cat(
                    [filler.float(), key_padding_mask.float()], dim=1
                )
            else:
                new_key_padding_mask = key_padding_mask.float()
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask

    def _get_input_buffer(
        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
    ) -> Dict[str, Optional[Tensor]]:
        """Return the cached attention state, or an empty dict when there is none."""
        # NOTE(review): get_incremental_state / set_incremental_state are not
        # defined in this class body; presumably they are provided elsewhere
        # (e.g. by a mixin or decorator) - confirm before relying on
        # incremental decoding.
        result = self.get_incremental_state(incremental_state, "attn_state")
        if result is not None:
            return result
        else:
            empty_result: Dict[str, Optional[Tensor]] = {}
            return empty_result

    def _set_input_buffer(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        buffer: Dict[str, Optional[Tensor]],
    ):
        """Store ``buffer`` as this module's cached attention state."""
        return self.set_incremental_state(incremental_state, "attn_state", buffer)

    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
        """Hook for sparse attention variants; returns ``attn_weights`` unchanged."""
        return attn_weights
858
+
859
+
860
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: length of a span for the "static" mask type; base parameter of the other mask types
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        mask_other: secondary parameter of the "uniform" and "normal" mask types
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans

    Returns:
        boolean array of the given shape, True at masked positions. Every row
        has the same number of masked positions (rows with more are randomly
        thinned down to the smallest row). A row for which zero spans are
        drawn is left unmasked.

    Raises:
        Exception: if mask_type is not one of the supported names.
    """

    def arrange(s, e, length, keep_length, out):
        # Place one span of `length` inside [s, e), record its indices in
        # `out` and return the leftover sub-ranges that can still host a span.
        if e - length <= s:
            # The span fits exactly; randint(s, s) would raise.
            span_start = s
        else:
            span_start = np.random.randint(s, e - length)
        out.extend(span_start + offset for offset in range(length))

        new_parts = []
        if span_start - s - min_space >= keep_length:
            new_parts.append((s, span_start - min_space + 1))
        if e - span_start - keep_length - min_space > keep_length:
            new_parts.append((span_start + length + min_space, e))
        return new_parts

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    all_num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * all_sz / float(mask_length)
        + np.random.rand()
    )

    all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length)
                + np.random.rand()
            )
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
        elif mask_type == "normal":
            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = np.random.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if num_mask == 0:
            # Nothing to place for this row; indexing lengths[0] would fail.
            mask_idcs.append(np.empty(0, dtype=np.int64))
            continue

        if sum(lengths) == 0:
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []
            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # Weight each free part by its size; parts too small for this
                # span get weight 0.
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    int,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                probs = lens / l_sum
                c = np.random.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length, mask_idc))
            mask_idc = np.asarray(mask_idc, dtype=np.int64)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1

            starts = np.random.choice(sz - min_len, num_mask, replace=False)

            mask_idc = np.asarray(
                [
                    start + offset
                    for start, length in zip(starts, lengths)
                    for offset in range(length)
                ],
                dtype=np.int64,
            )

        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))

    # Thin every row down to the smallest row so all rows mask equally many
    # positions.
    min_len = min((len(m) for m in mask_idcs), default=0)
    for i, mask_idc in enumerate(mask_idcs):
        if len(mask_idc) > min_len:
            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
        mask[i, mask_idc] = True

    return mask
983
+
984
+
985
class WavLMConfig:
    """Plain attribute container holding the WavLM hyper-parameters.

    Every option is set to its default in ``__init__``; entries of an optional
    ``cfg`` mapping are then written over the defaults via :meth:`update`.
    """

    def __init__(self, cfg=None):
        # --- feature extractor / transformer encoder ---
        # mode for feature extractor. default has a single group norm with d
        # groups in the first conv block, whereas layer_norm has layer norms
        # in every block (meant to use with normalize=True)
        self.extractor_mode: str = "default"
        # num encoder layers in the transformer
        self.encoder_layers: int = 12

        # encoder embedding dimension
        self.encoder_embed_dim: int = 768
        # encoder embedding dimension for FFN
        self.encoder_ffn_embed_dim: int = 3072
        # num encoder attention heads
        self.encoder_attention_heads: int = 12
        # activation function to use
        self.activation_fn: str = "gelu"

        # apply layernorm first in the transformer
        self.layer_norm_first: bool = False
        # string describing convolutional feature extraction layers in form of
        # a python list that contains [(dim, kernel_size, stride), ...]
        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"
        # include bias in conv encoder
        self.conv_bias: bool = False
        # multiply feature extractor var grads by this
        self.feature_grad_mult: float = 1.0

        # normalize input to have 0 mean and unit variance during training
        self.normalize: bool = False

        # --- dropouts ---
        # dropout probability for the transformer
        self.dropout: float = 0.1
        # dropout probability for attention weights
        self.attention_dropout: float = 0.1
        # dropout probability after activation in FFN
        self.activation_dropout: float = 0.0
        # probability of dropping a transformer layer
        self.encoder_layerdrop: float = 0.0
        # dropout to apply to the input (after feat extr)
        self.dropout_input: float = 0.0
        # dropout to apply to the features (after feat extr)
        self.dropout_features: float = 0.0

        # --- masking ---
        # mask length
        self.mask_length: int = 10
        # probability of replacing a token with mask
        self.mask_prob: float = 0.65
        # how to choose mask length
        self.mask_selection: str = "static"
        # secondary mask argument (used for more complex distributions),
        # see help in compute_mask_indices
        self.mask_other: float = 0
        # whether to allow masks to overlap
        self.no_mask_overlap: bool = False
        # min space between spans (if no overlap is enabled)
        self.mask_min_space: int = 1

        # --- channel masking ---
        # length of the mask for features (channels)
        self.mask_channel_length: int = 10
        # probability of replacing a feature with 0
        self.mask_channel_prob: float = 0.0
        # how to choose mask length for channel masking
        self.mask_channel_selection: str = "static"
        # secondary mask argument (used for more complex distributions),
        # see help in compute_mask_indices
        self.mask_channel_other: float = 0
        # whether to allow channel masks to overlap
        self.no_mask_channel_overlap: bool = False
        # min space between spans (if no overlap is enabled)
        self.mask_channel_min_space: int = 1

        # --- positional embeddings ---
        # number of filters for convolutional positional embeddings
        self.conv_pos: int = 128
        # number of groups for convolutional positional embedding
        self.conv_pos_groups: int = 16

        # --- relative position embedding ---
        # apply relative position embedding
        self.relative_position_embedding: bool = False
        # number of buckets for relative position embedding
        self.num_buckets: int = 320
        # maximum distance for relative position embedding
        self.max_distance: int = 1280
        # apply gated relative position embedding
        self.gru_rel_pos: bool = False

        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        """Overwrite (or add) attributes from the key/value pairs of ``cfg``."""
        vars(self).update(cfg)
1069
+
1070
+
1071
class WavLM(nn.Module):
    """Convolutional feature extractor followed by a Transformer encoder."""

    def __init__(
        self,
        cfg: WavLMConfig,
    ) -> None:
        """Build all sub-modules from ``cfg`` and copy its masking options."""
        super().__init__()
        logger.info(f"WavLM Config: {cfg.__dict__}")

        self.cfg = cfg
        # NOTE(review): eval() executes the config string as Python code; this
        # is only safe while configs come from a trusted source. The default
        # value uses list `+` / `*`, which ast.literal_eval cannot parse.
        conv_spec = eval(cfg.conv_feature_layers)
        # Output channels of the last conv layer.
        self.embed = conv_spec[-1][0]

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=conv_spec,
            dropout=0.0,
            mode=cfg.extractor_mode,
            conv_bias=cfg.conv_bias,
        )

        # Project conv features only when their size differs from the encoder's.
        if self.embed != cfg.encoder_embed_dim:
            self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim)
        else:
            self.post_extract_proj = None

        # time masking options
        self.mask_prob = cfg.mask_prob
        self.mask_selection = cfg.mask_selection
        self.mask_other = cfg.mask_other
        self.mask_length = cfg.mask_length
        self.no_mask_overlap = cfg.no_mask_overlap
        self.mask_min_space = cfg.mask_min_space

        # channel masking options
        self.mask_channel_prob = cfg.mask_channel_prob
        self.mask_channel_selection = cfg.mask_channel_selection
        self.mask_channel_other = cfg.mask_channel_other
        self.mask_channel_length = cfg.mask_channel_length
        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
        self.mask_channel_min_space = cfg.mask_channel_min_space

        self.dropout_input = nn.Dropout(cfg.dropout_input)
        self.dropout_features = nn.Dropout(cfg.dropout_features)

        self.feature_grad_mult = cfg.feature_grad_mult

        # Learned vector written over masked time steps.
        self.mask_emb = nn.Parameter(
            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
        )

        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

    def apply_mask(self, x, padding_mask):
        """Mask ``x`` in place along time and/or channels.

        ``x`` is unpacked as ``(B, T, C)``. Masked time steps are overwritten
        with ``mask_emb``, masked channels with 0. Returns ``x`` and the
        boolean time mask (``None`` when time masking is disabled).
        """
        B, T, C = x.shape

        mask_indices = None
        if self.mask_prob > 0:
            time_mask = compute_mask_indices(
                (B, T),
                padding_mask,
                self.mask_prob,
                self.mask_length,
                self.mask_selection,
                self.mask_other,
                min_masks=2,
                no_overlap=self.no_mask_overlap,
                min_space=self.mask_min_space,
            )
            mask_indices = torch.from_numpy(time_mask).to(x.device)
            x[mask_indices] = self.mask_emb

        if self.mask_channel_prob > 0:
            channel_mask = compute_mask_indices(
                (B, C),
                None,
                self.mask_channel_prob,
                self.mask_channel_length,
                self.mask_channel_selection,
                self.mask_channel_other,
                no_overlap=self.no_mask_channel_overlap,
                min_space=self.mask_channel_min_space,
            )
            # Broadcast the per-sample channel mask over every time step.
            channel_mask = torch.from_numpy(channel_mask).to(x.device)
            mask_channel_indices = channel_mask.unsqueeze(1).expand(-1, T, -1)
            x[mask_channel_indices] = 0

        return x, mask_indices

    def forward_padding_mask(
        self,
        features: torch.Tensor,
        padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Reduce ``padding_mask`` to one flag per feature frame.

        Trailing positions that do not divide evenly into
        ``features.size(1)`` frames are dropped; a frame is flagged only when
        all positions grouped into it are flagged.
        """
        num_frames = features.size(1)
        remainder = padding_mask.size(1) % num_frames
        if remainder > 0:
            padding_mask = padding_mask[:, :-remainder]
        grouped = padding_mask.view(padding_mask.size(0), num_frames, -1)
        return grouped.all(-1)

    def extract_features(
        self,
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        mask: bool = False,
        ret_conv: bool = False,
        output_layer: Optional[int] = None,
        ret_layer_results: bool = False,
    ):
        """Run the feature extractor and the encoder on ``source``.

        Args:
            source: input passed to the feature extractor.
            padding_mask: optional mask, reduced to frame level via
                :meth:`forward_padding_mask`.
            mask: apply :meth:`apply_mask` to the features before encoding.
            ret_conv: return the pre-encoder features instead of the encoder
                output.
            output_layer: 1-based layer index forwarded (minus one) to the
                encoder as ``layer``; ``None`` forwards ``None``.
            ret_layer_results: return ``(feature, layer_results)`` as the
                first item.

        Returns:
            ``(feature, padding_mask)`` with the frame-level padding mask.
        """
        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            # A non-positive multiplier runs the extractor without gradients.
            with torch.no_grad():
                features = self.feature_extractor(source)

        features = self.layer_norm(features.transpose(1, 2))

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)

        x = features
        if mask:
            # apply_mask writes into `features` in place and returns it.
            x, _ = self.apply_mask(features, padding_mask)

        # feature: (B, T, D), float
        # target: (B, T), long
        # x: (B, T, D), float
        # padding_mask: (B, T), bool
        # mask_indices: (B, T), bool
        encoder_layer = None if output_layer is None else output_layer - 1
        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
            layer=encoder_layer,
        )

        feature = features if ret_conv else x
        if ret_layer_results:
            feature = (feature, layer_results)
        return feature, padding_mask
1229
+
1230
+
1231
class ConvFeatureExtractionModel(nn.Module):
    """Stack of convolutional layers applied to the raw input.

    ``conv_type`` selects one of three layouts:

    * ``"default"`` -- ``Conv1d`` blocks, each followed by dropout, an
      optional normalisation and GELU.
    * ``"conv2d"`` -- plain ``Conv2d`` + ``ReLU`` pairs.
    * ``"custom"`` -- ``Conv2d`` (padding 1) + ``LayerNorm`` + ``ReLU``, with
      a 2x2 max-pool (``ceil_mode=True``) after every second layer.  The
      ``LayerNorm`` shapes are built for an input whose last axis has 80
      entries.

    Args:
        conv_layers: one ``(dim, kernel, stride)`` triple per layer.
        dropout: dropout probability inside the ``"default"`` blocks.
        mode: ``"default"`` (group norm on the first layer only) or
            ``"layer_norm"`` (layer norm on every layer).  Only consulted by
            the ``"default"`` conv type.
        conv_bias: whether the ``Conv1d`` layers carry a bias.
        conv_type: ``"default"``, ``"conv2d"`` or ``"custom"``.

    Raises:
        ValueError: if ``conv_type`` is not one of the supported values.
    """

    def __init__(
        self,
        conv_layers: List[Tuple[int, int, int]],
        dropout: float = 0.0,
        mode: str = "default",
        conv_bias: bool = False,
        conv_type: str = "default",
    ):
        super().__init__()

        assert mode in {"default", "layer_norm"}

        # Fail fast: an unknown conv_type used to be accepted silently and
        # left the module without ``conv_layers``, so the error only showed
        # up later as an AttributeError inside forward().
        if conv_type not in {"default", "conv2d", "custom"}:
            raise ValueError(
                "unsupported conv_type: "
                + repr(conv_type)
                + " (expected 'default', 'conv2d' or 'custom')"
            )

        def block(
            n_in,
            n_out,
            k,
            stride,
            is_layer_norm=False,
            is_group_norm=False,
            conv_bias=False,
        ):
            def make_conv():
                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
                nn.init.kaiming_normal_(conv.weight)
                return conv

            assert (
                is_layer_norm and is_group_norm
            ) is False, "layer norm and group norm are exclusive"

            # The normalisation width is this block's own output width
            # (``n_out``) rather than a variable captured from the caller.
            if is_layer_norm:
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    nn.Sequential(
                        TransposeLast(),
                        Fp32LayerNorm(n_out, elementwise_affine=True),
                        TransposeLast(),
                    ),
                    nn.GELU(),
                )
            elif is_group_norm:
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    Fp32GroupNorm(n_out, n_out, affine=True),
                    nn.GELU(),
                )
            else:
                return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())

        self.conv_type = conv_type
        self.conv_layers = nn.ModuleList()
        in_d = 1

        if self.conv_type == "default":
            for i, cl in enumerate(conv_layers):
                assert len(cl) == 3, "invalid conv definition: " + str(cl)
                (dim, k, stride) = cl

                self.conv_layers.append(
                    block(
                        in_d,
                        dim,
                        k,
                        stride,
                        is_layer_norm=mode == "layer_norm",
                        is_group_norm=mode == "default" and i == 0,
                        conv_bias=conv_bias,
                    )
                )
                in_d = dim
        elif self.conv_type == "conv2d":
            for cl in conv_layers:
                assert len(cl) == 3
                (dim, k, stride) = cl

                self.conv_layers.append(torch.nn.Conv2d(in_d, dim, k, stride))
                self.conv_layers.append(torch.nn.ReLU())
                in_d = dim
        else:  # "custom"
            # Size of the last input axis; halved (rounding up) by each pool.
            idim = 80
            for i, cl in enumerate(conv_layers):
                assert len(cl) == 3
                (dim, k, stride) = cl
                self.conv_layers.append(
                    torch.nn.Conv2d(in_d, dim, k, stride, padding=1)
                )
                self.conv_layers.append(torch.nn.LayerNorm([dim, idim]))
                self.conv_layers.append(torch.nn.ReLU())
                in_d = dim
                if (i + 1) % 2 == 0:
                    self.conv_layers.append(
                        torch.nn.MaxPool2d(2, stride=2, ceil_mode=True)
                    )
                    idim = int(math.ceil(idim / 2))

    def forward(self, x, mask=None):
        """Apply the convolutional stack.

        Args:
            x: input tensor; a channel axis is inserted at position 1.
            mask: accepted for interface compatibility, not used.

        Returns:
            A 3-D tensor.  For the 2-D conv types the channel axis and the
            last axis are flattened into dimension 1.
        """
        # BxT -> BxCxT
        x = x.unsqueeze(1)
        if self.conv_type == "custom":
            for conv in self.conv_layers:
                if isinstance(conv, nn.LayerNorm):
                    # LayerNorm normalises the two trailing axes, so move the
                    # channel axis next to the last one and back again.
                    x = conv(x.transpose(1, 2)).transpose(1, 2)
                else:
                    x = conv(x)
            x = x.transpose(2, 3).contiguous()
            return x.view(x.size(0), -1, x.size(-1))

        for conv in self.conv_layers:
            x = conv(x)
        if self.conv_type == "conv2d":
            b, c, t, f = x.size()
            x = x.transpose(2, 3).contiguous().view(b, c * f, t)
        return x
1353
+
1354
+
1355
class TransformerEncoder(nn.Module):
    """Convolutional positional embedding followed by a stack of
    ``TransformerSentenceEncoderLayer`` modules.

    All hyper-parameters are read from attributes of ``args``.  The relative
    position settings are optional: when ``args`` has no
    ``relative_position_embedding`` attribute they are disabled.
    """

    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        # Grouped 1-D convolution used as positional embedding.
        pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        std = math.sqrt(4.0 / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(pos_conv.weight, mean=0, std=std)
        nn.init.constant_(pos_conv.bias, 0)
        pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(pos_conv, SamePad(args.conv_pos), nn.GELU())

        has_rel_pos_config = hasattr(args, "relative_position_embedding")
        self.relative_position_embedding = (
            args.relative_position_embedding if has_rel_pos_config else False
        )
        self.num_buckets = args.num_buckets if has_rel_pos_config else 0
        self.max_distance = args.max_distance if has_rel_pos_config else 0

        encoder_layers = []
        for i in range(args.encoder_layers):
            encoder_layers.append(
                TransformerSentenceEncoderLayer(
                    embedding_dim=self.embedding_dim,
                    ffn_embedding_dim=args.encoder_ffn_embed_dim,
                    num_attention_heads=args.encoder_attention_heads,
                    dropout=self.dropout,
                    attention_dropout=args.attention_dropout,
                    activation_dropout=args.activation_dropout,
                    activation_fn=args.activation_fn,
                    layer_norm_first=args.layer_norm_first,
                    # Only the first layer is asked to own the relative
                    # attention bias.
                    has_relative_attention_bias=(
                        self.relative_position_embedding and i == 0
                    ),
                    num_buckets=self.num_buckets,
                    max_distance=self.max_distance,
                    gru_rel_pos=args.gru_rel_pos,
                )
            )
        self.layers = nn.ModuleList(encoder_layers)

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        self.apply(init_bert_params)

    def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
        """Encode ``x``; returns ``(x, layer_results)``.

        With ``layer_norm_first`` the closing layer norm is applied here, but
        only when the full stack was run (``layer is None``).
        """
        x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)

        if self.layer_norm_first and layer is None:
            x = self.layer_norm(x)

        return x, layer_results

    def extract_features(
        self, x, padding_mask=None, streaming_mask=None, tgt_layer=None
    ):
        """Run the positional convolution and the layer stack.

        Args:
            x: input, batch first (see the transpose comments below).
            padding_mask: optional boolean mask; masked positions of ``x``
                are zeroed **in place** before anything else happens.
            streaming_mask: forwarded to every layer as ``self_attn_mask``.
            tgt_layer: 0-based index of the last layer to run.  When it is
                not ``None`` the per-layer outputs are also collected.

        Returns:
            ``(x, layer_results)``; ``layer_results`` is empty unless
            ``tgt_layer`` was given, otherwise it holds the encoder input
            followed by one ``(x, z)`` pair per visited layer.
        """
        if padding_mask is not None:
            x[padding_mask] = 0

        x = x + self.pos_conv(x.transpose(1, 2)).transpose(1, 2)

        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        collect = tgt_layer is not None
        layer_results = []
        attn = None
        if collect:
            layer_results.append((x, attn))

        pos_bias = None
        for idx, encoder_layer in enumerate(self.layers):
            # LayerDrop: while training, a layer only runs when the uniform
            # draw exceeds ``self.layerdrop``.
            draw = np.random.random()
            if not self.training or (draw > self.layerdrop):
                x, attn, pos_bias = encoder_layer(
                    x,
                    self_attn_padding_mask=padding_mask,
                    need_weights=False,
                    self_attn_mask=streaming_mask,
                    pos_bias=pos_bias,
                )
            if collect:
                layer_results.append((x, attn))
            if idx == tgt_layer:
                break

        # T x B x C -> B x T x C
        return x.transpose(0, 1), layer_results
1469
+
1470
+
1471
class TransformerSentenceEncoderLayer(nn.Module):
    """
    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
    models.

    The layer is a self-attention sub-block followed by a position-wise
    feed-forward sub-block, each wrapped in a residual connection.
    ``layer_norm_first`` chooses whether layer norm is applied before
    (pre-norm) or after (post-norm) each sub-block.

    Args:
        embedding_dim: model width.
        ffn_embedding_dim: hidden width of the feed-forward sub-block.
        num_attention_heads: number of heads passed to the attention module.
        dropout: dropout after the attention and after the feed-forward output.
        attention_dropout: dropout passed to the attention module.
        activation_dropout: dropout after the feed-forward activation.
        activation_fn: activation name; ``"glu"`` swaps the first linear
            layer for ``GLU_Linear`` and skips the separate activation call.
        layer_norm_first: use the pre-norm arrangement.
        has_relative_attention_bias, num_buckets, max_distance, rescale_init,
        gru_rel_pos: forwarded unchanged to ``MultiheadAttention``.
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        layer_norm_first: bool = False,
        has_relative_attention_bias: bool = False,
        num_buckets: int = 0,
        max_distance: int = 0,
        rescale_init: bool = False,
        gru_rel_pos: bool = False,
    ) -> None:
        super().__init__()
        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Initialize blocks
        self.activation_name = activation_fn
        self.activation_fn = get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            has_relative_attention_bias=has_relative_attention_bias,
            num_buckets=num_buckets,
            max_distance=max_distance,
            rescale_init=rescale_init,
            gru_rel_pos=gru_rel_pos,
        )

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(self.activation_dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.layer_norm_first = layer_norm_first

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)

        if self.activation_name == "glu":
            self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
        else:
            self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim)

    def _feed_forward(self, x: torch.Tensor) -> torch.Tensor:
        """Feed-forward sub-block without residual connection or layer norm."""
        if self.activation_name == "glu":
            # No separate activation call in the "glu" configuration.
            x = self.fc1(x)
        else:
            x = self.activation_fn(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return self.dropout3(x)

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
        need_weights: bool = False,
        pos_bias=None,
    ):
        """
        LayerNorm is applied either before or after the self-attention/ffn
        modules similar to the original Transformer implementation.

        Args:
            x: input tensor, used as query, key and value.
            self_attn_mask: forwarded to the attention module as ``attn_mask``.
            self_attn_padding_mask: forwarded as ``key_padding_mask``.
            need_weights: forwarded to the attention module in the post-norm
                arrangement only; the pre-norm arrangement always passes
                ``False``.
            pos_bias: forwarded as ``position_bias``.

        Returns:
            ``(x, attn, pos_bias)`` -- the layer output plus the second and
            third values returned by the attention module.
        """
        residual = x

        if self.layer_norm_first:
            x = self.self_attn_layer_norm(x)
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=False,
                attn_mask=self_attn_mask,
                position_bias=pos_bias,
            )
            x = residual + self.dropout1(x)

            residual = x
            x = residual + self._feed_forward(self.final_layer_norm(x))
        else:
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias,
            )
            x = residual + self.dropout1(x)
            x = self.self_attn_layer_norm(x)

            residual = x
            x = residual + self._feed_forward(x)
            x = self.final_layer_norm(x)

        return x, attn, pos_bias