Spaces:
Sleeping
Sleeping
Commit
·
f6f30f3
0
Parent(s):
Initial commit
Browse files- .gitignore +164 -0
- CHANGELOG.md +0 -0
- CODE_OF_CONDUCT.md +80 -0
- CONTRIBUTING.md +31 -0
- LICENSE +395 -0
- README.md +111 -0
- evaluation_data/AES_PAM.jsonl +0 -0
- evaluation_data/AES_natural_music.jsonl +0 -0
- evaluation_data/AES_natural_sound.jsonl +0 -0
- evaluation_data/AES_natural_speech.jsonl +0 -0
- pyproject.toml +51 -0
- src/audiobox_aesthetics/__init__.py +0 -0
- src/audiobox_aesthetics/cli.py +113 -0
- src/audiobox_aesthetics/infer.py +214 -0
- src/audiobox_aesthetics/model/__init__.py +0 -0
- src/audiobox_aesthetics/model/aes_wavlm.py +175 -0
- src/audiobox_aesthetics/model/utils.py +25 -0
- src/audiobox_aesthetics/model/wavlm.py +1597 -0
.gitignore
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
110 |
+
.pdm.toml
|
111 |
+
.pdm-python
|
112 |
+
.pdm-build/
|
113 |
+
|
114 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
115 |
+
__pypackages__/
|
116 |
+
|
117 |
+
# Celery stuff
|
118 |
+
celerybeat-schedule
|
119 |
+
celerybeat.pid
|
120 |
+
|
121 |
+
# SageMath parsed files
|
122 |
+
*.sage.py
|
123 |
+
|
124 |
+
# Environments
|
125 |
+
.env
|
126 |
+
.venv
|
127 |
+
env/
|
128 |
+
venv/
|
129 |
+
ENV/
|
130 |
+
env.bak/
|
131 |
+
venv.bak/
|
132 |
+
|
133 |
+
# Spyder project settings
|
134 |
+
.spyderproject
|
135 |
+
.spyproject
|
136 |
+
|
137 |
+
# Rope project settings
|
138 |
+
.ropeproject
|
139 |
+
|
140 |
+
# mkdocs documentation
|
141 |
+
/site
|
142 |
+
|
143 |
+
# mypy
|
144 |
+
.mypy_cache/
|
145 |
+
.dmypy.json
|
146 |
+
dmypy.json
|
147 |
+
|
148 |
+
# Pyre type checker
|
149 |
+
.pyre/
|
150 |
+
|
151 |
+
# pytype static type analyzer
|
152 |
+
.pytype/
|
153 |
+
|
154 |
+
# Cython debug symbols
|
155 |
+
cython_debug/
|
156 |
+
|
157 |
+
# PyCharm
|
158 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
159 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
160 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
161 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
162 |
+
# .idea/
|
163 |
+
.vscode/
|
164 |
+
.ruff_cache/
|
CHANGELOG.md
ADDED
File without changes
|
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Code of Conduct
|
2 |
+
|
3 |
+
## Our Pledge
|
4 |
+
|
5 |
+
In the interest of fostering an open and welcoming environment, we as
|
6 |
+
contributors and maintainers pledge to make participation in our project and
|
7 |
+
our community a harassment-free experience for everyone, regardless of age, body
|
8 |
+
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
9 |
+
level of experience, education, socio-economic status, nationality, personal
|
10 |
+
appearance, race, religion, or sexual identity and orientation.
|
11 |
+
|
12 |
+
## Our Standards
|
13 |
+
|
14 |
+
Examples of behavior that contributes to creating a positive environment
|
15 |
+
include:
|
16 |
+
|
17 |
+
* Using welcoming and inclusive language
|
18 |
+
* Being respectful of differing viewpoints and experiences
|
19 |
+
* Gracefully accepting constructive criticism
|
20 |
+
* Focusing on what is best for the community
|
21 |
+
* Showing empathy towards other community members
|
22 |
+
|
23 |
+
Examples of unacceptable behavior by participants include:
|
24 |
+
|
25 |
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26 |
+
advances
|
27 |
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28 |
+
* Public or private harassment
|
29 |
+
* Publishing others' private information, such as a physical or electronic
|
30 |
+
address, without explicit permission
|
31 |
+
* Other conduct which could reasonably be considered inappropriate in a
|
32 |
+
professional setting
|
33 |
+
|
34 |
+
## Our Responsibilities
|
35 |
+
|
36 |
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37 |
+
behavior and are expected to take appropriate and fair corrective action in
|
38 |
+
response to any instances of unacceptable behavior.
|
39 |
+
|
40 |
+
Project maintainers have the right and responsibility to remove, edit, or
|
41 |
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42 |
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43 |
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44 |
+
threatening, offensive, or harmful.
|
45 |
+
|
46 |
+
## Scope
|
47 |
+
|
48 |
+
This Code of Conduct applies within all project spaces, and it also applies when
|
49 |
+
an individual is representing the project or its community in public spaces.
|
50 |
+
Examples of representing a project or community include using an official
|
51 |
+
project e-mail address, posting via an official social media account, or acting
|
52 |
+
as an appointed representative at an online or offline event. Representation of
|
53 |
+
a project may be further defined and clarified by project maintainers.
|
54 |
+
|
55 |
+
This Code of Conduct also applies outside the project spaces when there is a
|
56 |
+
reasonable belief that an individual's behavior may have a negative impact on
|
57 |
+
the project or its community.
|
58 |
+
|
59 |
+
## Enforcement
|
60 |
+
|
61 |
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
62 |
+
reported by contacting the project team at <opensource-conduct@meta.com>. All
|
63 |
+
complaints will be reviewed and investigated and will result in a response that
|
64 |
+
is deemed necessary and appropriate to the circumstances. The project team is
|
65 |
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
66 |
+
Further details of specific enforcement policies may be posted separately.
|
67 |
+
|
68 |
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
69 |
+
faith may face temporary or permanent repercussions as determined by other
|
70 |
+
members of the project's leadership.
|
71 |
+
|
72 |
+
## Attribution
|
73 |
+
|
74 |
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
75 |
+
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
76 |
+
|
77 |
+
[homepage]: https://www.contributor-covenant.org
|
78 |
+
|
79 |
+
For answers to common questions about this code of conduct, see
|
80 |
+
https://www.contributor-covenant.org/faq
|
CONTRIBUTING.md
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Contributing to audiobox-aesthetics
|
2 |
+
We want to make contributing to this project as easy and transparent as
|
3 |
+
possible.
|
4 |
+
|
5 |
+
## Pull Requests
|
6 |
+
We actively welcome your pull requests.
|
7 |
+
|
8 |
+
1. Fork the repo and create your branch from `main`.
|
9 |
+
2. If you've added code that should be tested, add tests.
|
10 |
+
3. If you've changed APIs, update the documentation.
|
11 |
+
4. Ensure the test suite passes.
|
12 |
+
5. Make sure your code lints.
|
13 |
+
6. If you haven't already, complete the Contributor License Agreement ("CLA").
|
14 |
+
|
15 |
+
## Contributor License Agreement ("CLA")
|
16 |
+
In order to accept your pull request, we need you to submit a CLA. You only need
|
17 |
+
to do this once to work on any of Meta's open source projects.
|
18 |
+
|
19 |
+
Complete your CLA here: <https://code.facebook.com/cla>
|
20 |
+
|
21 |
+
## Issues
|
22 |
+
We use GitHub issues to track public bugs. Please ensure your description is
|
23 |
+
clear and has sufficient instructions to be able to reproduce the issue.
|
24 |
+
|
25 |
+
Meta has a [bounty program](https://bugbounty.meta.com/) for the safe
|
26 |
+
disclosure of security bugs. In those cases, please go through the process
|
27 |
+
outlined on that page and do not file a public issue.
|
28 |
+
|
29 |
+
## License
|
30 |
+
By contributing to audiobox-aesthetics, you agree that your contributions will be licensed
|
31 |
+
under the LICENSE file in the root directory of this source tree.
|
LICENSE
ADDED
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Attribution 4.0 International
|
2 |
+
|
3 |
+
=======================================================================
|
4 |
+
|
5 |
+
Creative Commons Corporation ("Creative Commons") is not a law firm and
|
6 |
+
does not provide legal services or legal advice. Distribution of
|
7 |
+
Creative Commons public licenses does not create a lawyer-client or
|
8 |
+
other relationship. Creative Commons makes its licenses and related
|
9 |
+
information available on an "as-is" basis. Creative Commons gives no
|
10 |
+
warranties regarding its licenses, any material licensed under their
|
11 |
+
terms and conditions, or any related information. Creative Commons
|
12 |
+
disclaims all liability for damages resulting from their use to the
|
13 |
+
fullest extent possible.
|
14 |
+
|
15 |
+
Using Creative Commons Public Licenses
|
16 |
+
|
17 |
+
Creative Commons public licenses provide a standard set of terms and
|
18 |
+
conditions that creators and other rights holders may use to share
|
19 |
+
original works of authorship and other material subject to copyright
|
20 |
+
and certain other rights specified in the public license below. The
|
21 |
+
following considerations are for informational purposes only, are not
|
22 |
+
exhaustive, and do not form part of our licenses.
|
23 |
+
|
24 |
+
Considerations for licensors: Our public licenses are
|
25 |
+
intended for use by those authorized to give the public
|
26 |
+
permission to use material in ways otherwise restricted by
|
27 |
+
copyright and certain other rights. Our licenses are
|
28 |
+
irrevocable. Licensors should read and understand the terms
|
29 |
+
and conditions of the license they choose before applying it.
|
30 |
+
Licensors should also secure all rights necessary before
|
31 |
+
applying our licenses so that the public can reuse the
|
32 |
+
material as expected. Licensors should clearly mark any
|
33 |
+
material not subject to the license. This includes other CC-
|
34 |
+
licensed material, or material used under an exception or
|
35 |
+
limitation to copyright. More considerations for licensors:
|
36 |
+
wiki.creativecommons.org/Considerations_for_licensors
|
37 |
+
|
38 |
+
Considerations for the public: By using one of our public
|
39 |
+
licenses, a licensor grants the public permission to use the
|
40 |
+
licensed material under specified terms and conditions. If
|
41 |
+
the licensor's permission is not necessary for any reason--for
|
42 |
+
example, because of any applicable exception or limitation to
|
43 |
+
copyright--then that use is not regulated by the license. Our
|
44 |
+
licenses grant only permissions under copyright and certain
|
45 |
+
other rights that a licensor has authority to grant. Use of
|
46 |
+
the licensed material may still be restricted for other
|
47 |
+
reasons, including because others have copyright or other
|
48 |
+
rights in the material. A licensor may make special requests,
|
49 |
+
such as asking that all changes be marked or described.
|
50 |
+
Although not required by our licenses, you are encouraged to
|
51 |
+
respect those requests where reasonable. More considerations
|
52 |
+
for the public:
|
53 |
+
wiki.creativecommons.org/Considerations_for_licensees
|
54 |
+
|
55 |
+
=======================================================================
|
56 |
+
|
57 |
+
Creative Commons Attribution 4.0 International Public License
|
58 |
+
|
59 |
+
By exercising the Licensed Rights (defined below), You accept and agree
|
60 |
+
to be bound by the terms and conditions of this Creative Commons
|
61 |
+
Attribution 4.0 International Public License ("Public License"). To the
|
62 |
+
extent this Public License may be interpreted as a contract, You are
|
63 |
+
granted the Licensed Rights in consideration of Your acceptance of
|
64 |
+
these terms and conditions, and the Licensor grants You such rights in
|
65 |
+
consideration of benefits the Licensor receives from making the
|
66 |
+
Licensed Material available under these terms and conditions.
|
67 |
+
|
68 |
+
|
69 |
+
Section 1 -- Definitions.
|
70 |
+
|
71 |
+
a. Adapted Material means material subject to Copyright and Similar
|
72 |
+
Rights that is derived from or based upon the Licensed Material
|
73 |
+
and in which the Licensed Material is translated, altered,
|
74 |
+
arranged, transformed, or otherwise modified in a manner requiring
|
75 |
+
permission under the Copyright and Similar Rights held by the
|
76 |
+
Licensor. For purposes of this Public License, where the Licensed
|
77 |
+
Material is a musical work, performance, or sound recording,
|
78 |
+
Adapted Material is always produced where the Licensed Material is
|
79 |
+
synched in timed relation with a moving image.
|
80 |
+
|
81 |
+
b. Adapter's License means the license You apply to Your Copyright
|
82 |
+
and Similar Rights in Your contributions to Adapted Material in
|
83 |
+
accordance with the terms and conditions of this Public License.
|
84 |
+
|
85 |
+
c. Copyright and Similar Rights means copyright and/or similar rights
|
86 |
+
closely related to copyright including, without limitation,
|
87 |
+
performance, broadcast, sound recording, and Sui Generis Database
|
88 |
+
Rights, without regard to how the rights are labeled or
|
89 |
+
categorized. For purposes of this Public License, the rights
|
90 |
+
specified in Section 2(b)(1)-(2) are not Copyright and Similar
|
91 |
+
Rights.
|
92 |
+
|
93 |
+
d. Effective Technological Measures means those measures that, in the
|
94 |
+
absence of proper authority, may not be circumvented under laws
|
95 |
+
fulfilling obligations under Article 11 of the WIPO Copyright
|
96 |
+
Treaty adopted on December 20, 1996, and/or similar international
|
97 |
+
agreements.
|
98 |
+
|
99 |
+
e. Exceptions and Limitations means fair use, fair dealing, and/or
|
100 |
+
any other exception or limitation to Copyright and Similar Rights
|
101 |
+
that applies to Your use of the Licensed Material.
|
102 |
+
|
103 |
+
f. Licensed Material means the artistic or literary work, database,
|
104 |
+
or other material to which the Licensor applied this Public
|
105 |
+
License.
|
106 |
+
|
107 |
+
g. Licensed Rights means the rights granted to You subject to the
|
108 |
+
terms and conditions of this Public License, which are limited to
|
109 |
+
all Copyright and Similar Rights that apply to Your use of the
|
110 |
+
Licensed Material and that the Licensor has authority to license.
|
111 |
+
|
112 |
+
h. Licensor means the individual(s) or entity(ies) granting rights
|
113 |
+
under this Public License.
|
114 |
+
|
115 |
+
i. Share means to provide material to the public by any means or
|
116 |
+
process that requires permission under the Licensed Rights, such
|
117 |
+
as reproduction, public display, public performance, distribution,
|
118 |
+
dissemination, communication, or importation, and to make material
|
119 |
+
available to the public including in ways that members of the
|
120 |
+
public may access the material from a place and at a time
|
121 |
+
individually chosen by them.
|
122 |
+
|
123 |
+
j. Sui Generis Database Rights means rights other than copyright
|
124 |
+
resulting from Directive 96/9/EC of the European Parliament and of
|
125 |
+
the Council of 11 March 1996 on the legal protection of databases,
|
126 |
+
as amended and/or succeeded, as well as other essentially
|
127 |
+
equivalent rights anywhere in the world.
|
128 |
+
|
129 |
+
k. You means the individual or entity exercising the Licensed Rights
|
130 |
+
under this Public License. Your has a corresponding meaning.
|
131 |
+
|
132 |
+
|
133 |
+
Section 2 -- Scope.
|
134 |
+
|
135 |
+
a. License grant.
|
136 |
+
|
137 |
+
1. Subject to the terms and conditions of this Public License,
|
138 |
+
the Licensor hereby grants You a worldwide, royalty-free,
|
139 |
+
non-sublicensable, non-exclusive, irrevocable license to
|
140 |
+
exercise the Licensed Rights in the Licensed Material to:
|
141 |
+
|
142 |
+
a. reproduce and Share the Licensed Material, in whole or
|
143 |
+
in part; and
|
144 |
+
|
145 |
+
b. produce, reproduce, and Share Adapted Material.
|
146 |
+
|
147 |
+
2. Exceptions and Limitations. For the avoidance of doubt, where
|
148 |
+
Exceptions and Limitations apply to Your use, this Public
|
149 |
+
License does not apply, and You do not need to comply with
|
150 |
+
its terms and conditions.
|
151 |
+
|
152 |
+
3. Term. The term of this Public License is specified in Section
|
153 |
+
6(a).
|
154 |
+
|
155 |
+
4. Media and formats; technical modifications allowed. The
|
156 |
+
Licensor authorizes You to exercise the Licensed Rights in
|
157 |
+
all media and formats whether now known or hereafter created,
|
158 |
+
and to make technical modifications necessary to do so. The
|
159 |
+
Licensor waives and/or agrees not to assert any right or
|
160 |
+
authority to forbid You from making technical modifications
|
161 |
+
necessary to exercise the Licensed Rights, including
|
162 |
+
technical modifications necessary to circumvent Effective
|
163 |
+
Technological Measures. For purposes of this Public License,
|
164 |
+
simply making modifications authorized by this Section 2(a)
|
165 |
+
(4) never produces Adapted Material.
|
166 |
+
|
167 |
+
5. Downstream recipients.
|
168 |
+
|
169 |
+
a. Offer from the Licensor -- Licensed Material. Every
|
170 |
+
recipient of the Licensed Material automatically
|
171 |
+
receives an offer from the Licensor to exercise the
|
172 |
+
Licensed Rights under the terms and conditions of this
|
173 |
+
Public License.
|
174 |
+
|
175 |
+
b. No downstream restrictions. You may not offer or impose
|
176 |
+
any additional or different terms or conditions on, or
|
177 |
+
apply any Effective Technological Measures to, the
|
178 |
+
Licensed Material if doing so restricts exercise of the
|
179 |
+
Licensed Rights by any recipient of the Licensed
|
180 |
+
Material.
|
181 |
+
|
182 |
+
6. No endorsement. Nothing in this Public License constitutes or
|
183 |
+
may be construed as permission to assert or imply that You
|
184 |
+
are, or that Your use of the Licensed Material is, connected
|
185 |
+
with, or sponsored, endorsed, or granted official status by,
|
186 |
+
the Licensor or others designated to receive attribution as
|
187 |
+
provided in Section 3(a)(1)(A)(i).
|
188 |
+
|
189 |
+
b. Other rights.
|
190 |
+
|
191 |
+
1. Moral rights, such as the right of integrity, are not
|
192 |
+
licensed under this Public License, nor are publicity,
|
193 |
+
privacy, and/or other similar personality rights; however, to
|
194 |
+
the extent possible, the Licensor waives and/or agrees not to
|
195 |
+
assert any such rights held by the Licensor to the limited
|
196 |
+
extent necessary to allow You to exercise the Licensed
|
197 |
+
Rights, but not otherwise.
|
198 |
+
|
199 |
+
2. Patent and trademark rights are not licensed under this
|
200 |
+
Public License.
|
201 |
+
|
202 |
+
3. To the extent possible, the Licensor waives any right to
|
203 |
+
collect royalties from You for the exercise of the Licensed
|
204 |
+
Rights, whether directly or through a collecting society
|
205 |
+
under any voluntary or waivable statutory or compulsory
|
206 |
+
licensing scheme. In all other cases the Licensor expressly
|
207 |
+
reserves any right to collect such royalties.
|
208 |
+
|
209 |
+
|
210 |
+
Section 3 -- License Conditions.
|
211 |
+
|
212 |
+
Your exercise of the Licensed Rights is expressly made subject to the
|
213 |
+
following conditions.
|
214 |
+
|
215 |
+
a. Attribution.
|
216 |
+
|
217 |
+
1. If You Share the Licensed Material (including in modified
|
218 |
+
form), You must:
|
219 |
+
|
220 |
+
a. retain the following if it is supplied by the Licensor
|
221 |
+
with the Licensed Material:
|
222 |
+
|
223 |
+
i. identification of the creator(s) of the Licensed
|
224 |
+
Material and any others designated to receive
|
225 |
+
attribution, in any reasonable manner requested by
|
226 |
+
the Licensor (including by pseudonym if
|
227 |
+
designated);
|
228 |
+
|
229 |
+
ii. a copyright notice;
|
230 |
+
|
231 |
+
iii. a notice that refers to this Public License;
|
232 |
+
|
233 |
+
iv. a notice that refers to the disclaimer of
|
234 |
+
warranties;
|
235 |
+
|
236 |
+
v. a URI or hyperlink to the Licensed Material to the
|
237 |
+
extent reasonably practicable;
|
238 |
+
|
239 |
+
b. indicate if You modified the Licensed Material and
|
240 |
+
retain an indication of any previous modifications; and
|
241 |
+
|
242 |
+
c. indicate the Licensed Material is licensed under this
|
243 |
+
Public License, and include the text of, or the URI or
|
244 |
+
hyperlink to, this Public License.
|
245 |
+
|
246 |
+
2. You may satisfy the conditions in Section 3(a)(1) in any
|
247 |
+
reasonable manner based on the medium, means, and context in
|
248 |
+
which You Share the Licensed Material. For example, it may be
|
249 |
+
reasonable to satisfy the conditions by providing a URI or
|
250 |
+
hyperlink to a resource that includes the required
|
251 |
+
information.
|
252 |
+
|
253 |
+
3. If requested by the Licensor, You must remove any of the
|
254 |
+
information required by Section 3(a)(1)(A) to the extent
|
255 |
+
reasonably practicable.
|
256 |
+
|
257 |
+
4. If You Share Adapted Material You produce, the Adapter's
|
258 |
+
License You apply must not prevent recipients of the Adapted
|
259 |
+
Material from complying with this Public License.
|
260 |
+
|
261 |
+
|
262 |
+
Section 4 -- Sui Generis Database Rights.
|
263 |
+
|
264 |
+
Where the Licensed Rights include Sui Generis Database Rights that
|
265 |
+
apply to Your use of the Licensed Material:
|
266 |
+
|
267 |
+
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
|
268 |
+
to extract, reuse, reproduce, and Share all or a substantial
|
269 |
+
portion of the contents of the database;
|
270 |
+
|
271 |
+
b. if You include all or a substantial portion of the database
|
272 |
+
contents in a database in which You have Sui Generis Database
|
273 |
+
Rights, then the database in which You have Sui Generis Database
|
274 |
+
Rights (but not its individual contents) is Adapted Material; and
|
275 |
+
|
276 |
+
c. You must comply with the conditions in Section 3(a) if You Share
|
277 |
+
all or a substantial portion of the contents of the database.
|
278 |
+
|
279 |
+
For the avoidance of doubt, this Section 4 supplements and does not
|
280 |
+
replace Your obligations under this Public License where the Licensed
|
281 |
+
Rights include other Copyright and Similar Rights.
|
282 |
+
|
283 |
+
|
284 |
+
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
|
285 |
+
|
286 |
+
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
|
287 |
+
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
|
288 |
+
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
289 |
+
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
|
290 |
+
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
|
291 |
+
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
292 |
+
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
|
293 |
+
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
|
294 |
+
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
|
295 |
+
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
|
296 |
+
|
297 |
+
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
|
298 |
+
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
|
299 |
+
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
|
300 |
+
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
|
301 |
+
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
|
302 |
+
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
|
303 |
+
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
|
304 |
+
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
|
305 |
+
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
|
306 |
+
|
307 |
+
c. The disclaimer of warranties and limitation of liability provided
|
308 |
+
above shall be interpreted in a manner that, to the extent
|
309 |
+
possible, most closely approximates an absolute disclaimer and
|
310 |
+
waiver of all liability.
|
311 |
+
|
312 |
+
|
313 |
+
Section 6 -- Term and Termination.
|
314 |
+
|
315 |
+
a. This Public License applies for the term of the Copyright and
|
316 |
+
Similar Rights licensed here. However, if You fail to comply with
|
317 |
+
this Public License, then Your rights under this Public License
|
318 |
+
terminate automatically.
|
319 |
+
|
320 |
+
b. Where Your right to use the Licensed Material has terminated under
|
321 |
+
Section 6(a), it reinstates:
|
322 |
+
|
323 |
+
1. automatically as of the date the violation is cured, provided
|
324 |
+
it is cured within 30 days of Your discovery of the
|
325 |
+
violation; or
|
326 |
+
|
327 |
+
2. upon express reinstatement by the Licensor.
|
328 |
+
|
329 |
+
For the avoidance of doubt, this Section 6(b) does not affect any
|
330 |
+
right the Licensor may have to seek remedies for Your violations
|
331 |
+
of this Public License.
|
332 |
+
|
333 |
+
c. For the avoidance of doubt, the Licensor may also offer the
|
334 |
+
Licensed Material under separate terms or conditions or stop
|
335 |
+
distributing the Licensed Material at any time; however, doing so
|
336 |
+
will not terminate this Public License.
|
337 |
+
|
338 |
+
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
|
339 |
+
License.
|
340 |
+
|
341 |
+
|
342 |
+
Section 7 -- Other Terms and Conditions.
|
343 |
+
|
344 |
+
a. The Licensor shall not be bound by any additional or different
|
345 |
+
terms or conditions communicated by You unless expressly agreed.
|
346 |
+
|
347 |
+
b. Any arrangements, understandings, or agreements regarding the
|
348 |
+
Licensed Material not stated herein are separate from and
|
349 |
+
independent of the terms and conditions of this Public License.
|
350 |
+
|
351 |
+
|
352 |
+
Section 8 -- Interpretation.
|
353 |
+
|
354 |
+
a. For the avoidance of doubt, this Public License does not, and
|
355 |
+
shall not be interpreted to, reduce, limit, restrict, or impose
|
356 |
+
conditions on any use of the Licensed Material that could lawfully
|
357 |
+
be made without permission under this Public License.
|
358 |
+
|
359 |
+
b. To the extent possible, if any provision of this Public License is
|
360 |
+
deemed unenforceable, it shall be automatically reformed to the
|
361 |
+
minimum extent necessary to make it enforceable. If the provision
|
362 |
+
cannot be reformed, it shall be severed from this Public License
|
363 |
+
without affecting the enforceability of the remaining terms and
|
364 |
+
conditions.
|
365 |
+
|
366 |
+
c. No term or condition of this Public License will be waived and no
|
367 |
+
failure to comply consented to unless expressly agreed to by the
|
368 |
+
Licensor.
|
369 |
+
|
370 |
+
d. Nothing in this Public License constitutes or may be interpreted
|
371 |
+
as a limitation upon, or waiver of, any privileges and immunities
|
372 |
+
that apply to the Licensor or You, including from the legal
|
373 |
+
processes of any jurisdiction or authority.
|
374 |
+
|
375 |
+
|
376 |
+
=======================================================================
|
377 |
+
|
378 |
+
Creative Commons is not a party to its public
|
379 |
+
licenses. Notwithstanding, Creative Commons may elect to apply one of
|
380 |
+
its public licenses to material it publishes and in those instances
|
381 |
+
will be considered the “Licensor.” The text of the Creative Commons
|
382 |
+
public licenses is dedicated to the public domain under the CC0 Public
|
383 |
+
Domain Dedication. Except for the limited purpose of indicating that
|
384 |
+
material is shared under a Creative Commons public license or as
|
385 |
+
otherwise permitted by the Creative Commons policies published at
|
386 |
+
creativecommons.org/policies, Creative Commons does not authorize the
|
387 |
+
use of the trademark "Creative Commons" or any other trademark or logo
|
388 |
+
of Creative Commons without its prior written consent including,
|
389 |
+
without limitation, in connection with any unauthorized modifications
|
390 |
+
to any of its public licenses or any other arrangements,
|
391 |
+
understandings, or agreements concerning use of licensed material. For
|
392 |
+
the avoidance of doubt, this paragraph does not form part of the
|
393 |
+
public licenses.
|
394 |
+
|
395 |
+
Creative Commons may be contacted at creativecommons.org.
|
README.md
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# audiobox-aesthetics
|
2 |
+
|
3 |
+
Unified automatic quality assessment for speech, music, and sound.
|
4 |
+
|
5 |
+
[](Paper)
|
6 |
+
|
7 |
+
## Installation
|
8 |
+
|
9 |
+
This repository requires Python 3.9 and PyTorch 2.2 or greater. To install, clone this repo and run:
|
10 |
+
```
|
11 |
+
pip install -e .
|
12 |
+
```
|
13 |
+
|
14 |
+
## Pre-trained Models
|
15 |
+
|
16 |
+
Model | Link
|
17 |
+
|---|---|
|
18 |
+
All axes | [checkpoint.pt](https://dl.fbaipublicfiles.com/audiobox-aesthetics/checkpoint.pt)
|
19 |
+
|
20 |
+
## Usage
|
21 |
+
|
22 |
+
How to run prediction:
|
23 |
+
|
24 |
+
1. Create a JSONL file with the following format:
|
25 |
+
```
|
26 |
+
{"path":"/path/to/a.wav"}
|
27 |
+
{"path":"/path/to/b.wav"}
|
28 |
+
...
|
29 |
+
{"path":"/path/to/z.wav"}
|
30 |
+
```
|
31 |
+
or, if you only want to predict aesthetic scores for a specific time range of each file (in seconds):
|
32 |
+
```
|
33 |
+
{"path":"/path/to/a.wav", "start_time":0, "end_time": 5}
|
34 |
+
{"path":"/path/to/b.wav", "start_time":3, "end_time": 10}
|
35 |
+
```
|
36 |
+
and save it as `input.jsonl`
|
37 |
+
|
38 |
+
2. Run the following command:
|
39 |
+
```
|
40 |
+
audio-aes input.jsonl --ckpt "/path/to/checkpoint.pt" > output.jsonl
|
41 |
+
```
|
42 |
+
|
43 |
+
3. The output file will contain the same number of rows as `input.jsonl`. Each row contains the predictions for the 4 axes as a JSON-formatted dictionary. Check the following table for more info:
|
44 |
+
Axes name | Full name
|
45 |
+
|---|---|
|
46 |
+
CE | Content Enjoyment
|
47 |
+
CU | Content Usefulness
|
48 |
+
PC | Production Complexity
|
49 |
+
PQ | Production Quality
|
50 |
+
|
51 |
+
Output line example:
|
52 |
+
```
|
53 |
+
{"CE": 5.146, "CU": 5.779, "PC": 2.148, "PQ": 7.220}
|
54 |
+
```
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
4. (Extra) If you want to extract only one axis (e.g. CE), post-process the output file with the following command using the `jq` utility:
|
59 |
+
|
60 |
+
```jq '.CE' output.jsonl > output-aes_ce.txt```
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
## Evaluation dataset
|
65 |
+
We released our evaluation dataset, which consists of aesthetic annotation scores along 4 axes.
|
66 |
+
|
67 |
+
Here, we show an example of how to read each annotation and re-map it to the actual audio file.
|
68 |
+
```
|
69 |
+
{
|
70 |
+
"data_path": "/your_path/LibriTTS/train-clean-100/1363/139304/1363_139304_000011_000000.wav",
|
71 |
+
"Production_Quality": [8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 8.0, 5.0, 8.0, 8.0],
|
72 |
+
"Production_Complexity": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
|
73 |
+
"Content_Enjoyment": [8.0, 6.0, 8.0, 5.0, 8.0, 8.0, 8.0, 6.0, 8.0, 6.0],
|
74 |
+
"Content_Usefulness": [8.0, 6.0, 8.0, 7.0, 8.0, 9.0, 8.0, 6.0, 10.0, 7.0]
|
75 |
+
}
|
76 |
+
```
|
77 |
+
1. Recognize the dataset name from data_path. In the example, it is LibriTTS.
|
78 |
+
2. Replace "/your_path/" with the path to your downloaded LibriTTS directory.
|
79 |
+
3. Each axis contains 10 scores annotated by 10 different human annotators.
|
80 |
+
|
81 |
+
data_path | URL
|
82 |
+
|---|---|
|
83 |
+
LibriTTS | https://openslr.org/60/
|
84 |
+
cv-corpus-13.0-2023-03-09 | https://commonvoice.mozilla.org/en/datasets
|
85 |
+
EARS | https://sp-uhh.github.io/ears_dataset/
|
86 |
+
MUSDB18 | https://sigsep.github.io/datasets/musdb.html
|
87 |
+
musiccaps | https://www.kaggle.com/datasets/googleai/musiccaps
|
88 |
+
(audioset) unbalanced_train_segments | https://research.google.com/audioset/dataset/index.html
|
89 |
+
PAM | https://zenodo.org/records/10737388
|
90 |
+
|
91 |
+
## License
|
92 |
+
The majority of audiobox-aesthetics is licensed under CC-BY 4.0, as found in the LICENSE file.
|
93 |
+
However, portions of the project are available under separate license terms: [https://github.com/microsoft/unilm](https://github.com/microsoft/unilm) is licensed under MIT license.
|
94 |
+
|
95 |
+
## Citation
|
96 |
+
If you found this repository useful, please use the following BibTeX entry.
|
97 |
+
|
98 |
+
```
|
99 |
+
@article{tjandra2025aes,
|
100 |
+
title={Meta Audiobox Aesthetics: Unified Automatic Quality Assessment for Speech, Music, and Sound},
|
101 |
+
author={Tjandra, Andros and Wu, Yi-Chiao and Guo, Baishan and Hoffman, John and Ellis, Brian and Vyas, Apoorv and Shi, Bowen and Chen, Sanyuan and Le, Matt and Zacharov, Nick and Wood, Carleigh and Lee, Ann and Hsu, Wei-ning},
|
102 |
+
publisher={Meta AI},
|
103 |
+
year={2025},
|
104 |
+
url={https://ai.meta.com/research/publications/meta-audiobox-aesthetics-unified-automatic-quality-assessment-for-speech-music-and-sound/}
|
105 |
+
}
|
106 |
+
```
|
107 |
+
|
108 |
+
## Acknowledgements
|
109 |
+
|
110 |
+
Parts of the model code are copied from [WavLM](https://github.com/microsoft/unilm/tree/master/wavlm).
|
111 |
+
|
evaluation_data/AES_PAM.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation_data/AES_natural_music.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation_data/AES_natural_sound.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation_data/AES_natural_speech.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=61.0"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "audiobox_aesthetics"
|
7 |
+
version = "0.0.1"
|
8 |
+
authors = [
|
9 |
+
{name="Andros Tjandra", email="[email protected]"},
|
10 |
+
{name="Yi-Chiao Wu"},
|
11 |
+
{name="Baishan Guo"},
|
12 |
+
{name="John Hoffman"},
|
13 |
+
{name="Brian Ellis"},
|
14 |
+
{name="Apoorv Vyas"},
|
15 |
+
{name="Bowen Shi"},
|
16 |
+
{name="Sanyuan Chen"},
|
17 |
+
{name="Matt Le"},
|
18 |
+
{name="Nick Zacharov"},
|
19 |
+
{name="Carleigh Wood"},
|
20 |
+
{name="Ann Lee"},
|
21 |
+
{name="Wei-ning Hsu"}
|
22 |
+
]
|
23 |
+
maintainers = [
|
24 |
+
{name="Andros Tjandra", email="[email protected]"}
|
25 |
+
]
|
26 |
+
description = "Unified automatic quality assessment for speech, music, and sound."
|
27 |
+
requires-python = ">=3.9"
|
28 |
+
classifiers = [
|
29 |
+
"Programming Language :: Python :: 3",
|
30 |
+
"Operating System :: OS Independent",
|
31 |
+
]
|
32 |
+
readme = "README.md"
|
33 |
+
license = {file = "LICENSE"}
|
34 |
+
|
35 |
+
dependencies = [
|
36 |
+
"numpy",
|
37 |
+
"torch>=2.2.0",
|
38 |
+
"torchaudio",
|
39 |
+
"tqdm",
|
40 |
+
"iopath",
|
41 |
+
"submitit"
|
42 |
+
]
|
43 |
+
|
44 |
+
[project.scripts]
|
45 |
+
audio-aes = "audiobox_aesthetics.cli:app"
|
46 |
+
|
47 |
+
[project.urls]
|
48 |
+
Homepage = "https://github.com/pypa/sampleproject"
|
49 |
+
Issues = "https://github.com/pypa/sampleproject/issues"
|
50 |
+
|
51 |
+
|
src/audiobox_aesthetics/__init__.py
ADDED
File without changes
|
src/audiobox_aesthetics/cli.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import argparse
|
8 |
+
from functools import partial
|
9 |
+
import itertools
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
import submitit
|
13 |
+
from .infer import load_dataset, main_predict
|
14 |
+
|
15 |
+
|
16 |
+
def parse_args():
    """Parse the command-line arguments of the audiobox-aesthetics CLI.

    Returns:
        argparse.Namespace holding the input file, the checkpoint path, the
        batch size and the Slurm options (the latter are only consumed when
        ``--remote`` is given).
    """
    # The text is a description, not a program name: passed positionally it
    # would be taken as ``prog`` and replace the executable name in the
    # usage line.
    parser = argparse.ArgumentParser(
        description="CLI for audiobox-aesthetics inference"
    )
    parser.add_argument("input_file", type=str)
    parser.add_argument("--ckpt", type=str, required=True)
    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument(
        "--remote", action="store_true", default=False, help="Set true to run via SLURM"
    )

    # remote == True
    parser.add_argument(
        "--job-dir", default="/tmp", type=str, help="Slurm job directory"
    )
    parser.add_argument(
        "--partition", default="learn", type=str, help="Slurm partition"
    )
    parser.add_argument("--qos", default="", type=str, help="Slurm QOS")
    parser.add_argument("--account", default="", type=str, help="Slurm account")
    parser.add_argument("--comment", default="", type=str, help="Slurm job comment")
    parser.add_argument(
        "--constraint",
        default="",
        type=str,
        help="Slurm constraint eg.: ampere80gb For using A100s or volta32gb for using V100s.",
    )
    parser.add_argument(
        "--exclude",
        default="",
        type=str,
        help="Exclude certain nodes from the slurm job.",
    )
    parser.add_argument(
        "--array", default=100, type=int, help="Slurm max array parallelism"
    )
    parser.add_argument(
        "--chunk", default=1000, type=int, help="chunk size per instance"
    )
    return parser.parse_args()
|
54 |
+
|
55 |
+
|
56 |
+
def app():
    """Run the ``audio-aes`` command: predict scores and print them to stdout.

    The input jsonl file is loaded in full. Without ``--remote`` the rows are
    scored in the current process; with ``--remote`` they are split into
    chunks of ``--chunk`` rows and submitted as a Slurm job array through
    submitit, and the per-chunk results are concatenated in chunk order.
    One output line is printed per prediction.
    """
    args = parse_args()

    metadata = load_dataset(args.input_file, 0, 2**64)
    fn_wrapped = partial(main_predict, batch_size=args.batch_size, ckpt=args.ckpt)

    if args.remote:
        # chunk metadata
        chunksize = args.chunk
        chunked = [
            metadata[ii : ii + chunksize] for ii in range(0, len(metadata), chunksize)
        ]

        job_dir = Path(args.job_dir)
        # parents=True so that a nested, not-yet-existing job dir is accepted.
        job_dir.mkdir(parents=True, exist_ok=True)

        executor = submitit.AutoExecutor(folder=f"{job_dir}/%A/")

        # Optional Slurm settings are only forwarded when the user supplied a
        # non-empty value, so empty strings never reach the scheduler.
        optional = {
            "slurm_constraint": args.constraint,
            "slurm_comment": args.comment,
            "slurm_qos": args.qos,
            "slurm_account": args.account,
            "slurm_exclude": args.exclude,
        }
        kwargs = {key: value for key, value in optional.items() if value}

        # Set the parameters for the Slurm job
        executor.update_parameters(
            slurm_nodes=1,
            slurm_gpus_per_node=1,
            slurm_tasks_per_node=1,
            slurm_cpus_per_task=10,
            timeout_min=60 * 20,  # max is 20 hours
            slurm_array_parallelism=min(
                len(chunked), args.array
            ),  # number of tasks in the array job
            slurm_partition=args.partition,
            **kwargs,
        )

        jobs = executor.map_array(fn_wrapped, chunked)
        # job.result() blocks until the corresponding array task has finished.
        outputs = [job.result() for job in jobs]

        outputs = itertools.chain(*outputs)
    else:
        outputs = fn_wrapped(metadata)
    print("\n".join(str(x) for x in outputs))
|
106 |
+
|
107 |
+
|
108 |
+
if __name__ == "__main__":
    # Example usage:
    #   python cli.py input.jsonl --batch-size 100 --ckpt /path/to/ckpt > output.jsonl
    app()
|
src/audiobox_aesthetics/infer.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
from dataclasses import dataclass
|
8 |
+
import json
|
9 |
+
import re
|
10 |
+
import sys
|
11 |
+
from typing import Any, Dict, List
|
12 |
+
from tqdm import tqdm
|
13 |
+
from iopath import PathManager
|
14 |
+
import torch
|
15 |
+
import torchaudio
|
16 |
+
import torch.nn.functional as F
|
17 |
+
|
18 |
+
from .model.aes_wavlm import Normalize, WavlmAudioEncoderMultiOutput
|
19 |
+
|
20 |
+
Batch = Dict[str, Any]
|
21 |
+
|
22 |
+
|
23 |
+
def read_wav(meta):
    """Load the audio referenced by one metadata row and down-mix it.

    Args:
        meta: mapping with a "path" entry. If it also has "start_time", an
            "end_time" entry is required too and only that segment of the
            file is loaded. Both values are multiplied by the file's sample
            rate, i.e. they are expressed in seconds.

    Returns:
        Tuple ``(wav, sr)``: the loaded waveform, averaged over its first
        dimension when that dimension is larger than 1, and the sample rate
        of the file.

    Raises:
        KeyError: if "path" is missing, or "start_time" is given without
            "end_time".
    """
    path = meta["path"]

    if "start_time" in meta:
        start = meta["start_time"]
        end = meta["end_time"]
        sr = torchaudio.info(path).sample_rate
        # Frame positions must be integers; fractional timestamps such as
        # 1.5 would otherwise be handed to torchaudio as floats.
        frame_offset = int(round(start * sr))
        num_frames = int(round((end - start) * sr))
        wav, _ = torchaudio.load(
            path, frame_offset=frame_offset, num_frames=num_frames
        )
    else:
        wav, sr = torchaudio.load(path)

    # presumably dim 0 is the channel axis (torchaudio default layout) -- confirm
    if wav.shape[0] > 1:
        wav = wav.mean(0, keepdim=True)

    return wav, sr
|
40 |
+
|
41 |
+
|
42 |
+
def make_inference_batch(
    input_wavs: list,
    hop_size=10,
    window_size=10,
    sample_rate=16000,
    pad_zero=True,
):
    """Cut each waveform into fixed-length windows along its last dimension.

    Args:
        input_wavs: list of tensors; windows are taken along the last axis.
        hop_size: distance between window starts, multiplied by
            ``sample_rate`` to obtain a number of samples.
        window_size: window length, multiplied by ``sample_rate`` to obtain a
            number of samples.
        sample_rate: samples per unit of ``hop_size`` / ``window_size``.
        pad_zero: when true, a window shorter than the full length is
            right-padded with zeros.

    Returns:
        Four parallel lists with one entry per window:
        ``wavs`` (window tensors), ``masks`` (bool tensors, True on real
        samples and False on padding), ``weights`` (fraction of the window
        that is real audio) and ``bids`` (index of the source waveform in
        ``input_wavs``). A waveform with zero samples contributes no window.
    """
    wavs = []
    masks = []
    weights = []
    bids = []
    offset = hop_size * sample_rate
    winlen = window_size * sample_rate
    for bid, wav in enumerate(input_wavs):
        for ii in range(0, wav.shape[-1], offset):
            wav_ii = wav[..., ii : ii + winlen]
            wav_ii_len = wav_ii.shape[-1]
            if wav_ii_len < winlen and pad_zero:
                wav_ii = F.pad(wav_ii, (0, winlen - wav_ii_len))
            mask_ii = torch.zeros_like(wav_ii, dtype=torch.bool)
            # Index with an ellipsis, like the slicing above, so that inputs
            # of any rank (not only 2-D) are supported.
            mask_ii[..., :wav_ii_len] = True
            wavs.append(wav_ii)
            masks.append(mask_ii)
            weights.append(wav_ii_len / winlen)
            bids.append(bid)
    return wavs, masks, weights, bids
|
68 |
+
|
69 |
+
|
70 |
+
AXES_NAME = ["CE", "CU", "PC", "PQ"]
|
71 |
+
|
72 |
+
|
73 |
+
@dataclass
class AesWavlmPredictorMultiOutput:
    """Inference wrapper that scores audio on the axes listed in ``AXES_NAME``.

    Call ``setup_model()`` once to load the checkpoint, then ``forward()``
    with a list of metadata rows to obtain one JSON string per row.
    """

    checkpoint_pth: str  # checkpoint location, opened through iopath's PathManager
    precision: str = "bf16"
    batch_size: int = 1
    data_col: str = "path"  # row key holding either a file path or a waveform
    sample_rate: int = 16000  # const

    def setup_model(self):
        """Load the checkpoint and build the model and target transforms.

        Sets ``self.device``, ``self.path_manager``, ``self.model``,
        ``self.dtype`` and ``self.target_transform``. The checkpoint is
        expected to contain the keys "state_dict", "model_cfg" and
        "target_transform".
        """
        # This method gets called before inference starts
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.path_manager = PathManager()
        print(f"Setting up Aesthetic model on {self.device}", file=sys.stderr)

        with self.path_manager.open(self.checkpoint_pth, "rb") as fin:
            ckpt = torch.load(fin, map_location=self.device)

        # Strip the literal "model." prefix from parameter names. The dot is
        # escaped so that names such as "modelX..." are left untouched.
        state_dict = {
            re.sub(r"^model\.", "", k): v for (k, v) in ckpt["state_dict"].items()
        }
        model = WavlmAudioEncoderMultiOutput(
            **{
                k: ckpt["model_cfg"][k]
                for k in [
                    "proj_num_layer",
                    "proj_ln",
                    "proj_act_fn",
                    "proj_dropout",
                    "nth_layer",
                    "use_weighted_layer_sum",
                    "precision",
                    "normalize_embed",
                    "output_dim",
                ]
            }
        )
        model.load_state_dict(state_dict)
        model.to(self.device)
        model.eval()

        self.model = model
        # None for any precision string other than "16" / "bf16".
        self.dtype = {
            "16": torch.float16,
            "bf16": torch.bfloat16,
        }.get(self.precision)

        # Per-axis statistics used to map model outputs back to score scale.
        self.target_transform = {
            axis: Normalize(
                mean=ckpt["target_transform"][axis]["mean"],
                std=ckpt["target_transform"][axis]["std"],
            )
            for axis in AXES_NAME
        }

    def audio_resample_mono(self, data_list: List[Batch]) -> List:
        """Return one resampled, averaged waveform per row of ``data_list``.

        A row whose ``data_col`` entry is a string is read from disk with
        ``read_wav``; otherwise the entry is used as the waveform and the
        row's "sample_rate" entry as its rate. Each waveform is resampled to
        ``self.sample_rate`` and averaged over dim 0 (kept as size 1).
        """
        wavs = []
        for item in data_list:
            if isinstance(item[self.data_col], str):
                wav, sr = read_wav(item)
            else:
                wav = item[self.data_col]
                sr = item["sample_rate"]

            wav = torchaudio.functional.resample(
                wav,
                orig_freq=sr,
                new_freq=self.sample_rate,
            )
            wav = wav.mean(dim=0, keepdim=True)
            wavs.append(wav)
        return wavs

    def forward(self, batch):
        """Score a list of metadata rows.

        Each waveform is split into 10-unit windows (hop 10, window 10, as
        passed to ``make_inference_batch``); the per-window predictions are
        mapped back with the inverse target transform and averaged per row,
        weighted by the fraction of each window that is real audio.

        Args:
            batch: list of metadata rows (see ``audio_resample_mono``).

        Returns:
            List of JSON strings, one per row and in input order, each a
            dictionary keyed by the names in ``AXES_NAME``. An empty batch
            yields an empty list.
        """
        with torch.inference_mode():
            bsz = len(batch)
            if bsz == 0:
                # torch.stack() below cannot handle an empty list.
                return []
            wavs = self.audio_resample_mono(batch)
            wavs, masks, weights, bids = make_inference_batch(
                wavs,
                10,
                10,
                sample_rate=self.sample_rate,
            )

            # collate
            wavs = torch.stack(wavs).to(self.device)
            masks = torch.stack(masks).to(self.device)
            weights = torch.tensor(weights).to(self.device)
            bids = torch.tensor(bids).to(self.device)

            assert wavs.shape[0] == masks.shape[0] == weights.shape[0] == bids.shape[0]
            preds_all = self.model({"wav": wavs, "mask": masks})
            all_result = {}
            for axis in AXES_NAME:
                preds = self.target_transform[axis].inverse(preds_all[axis])
                weighted_preds = []
                for bii in range(bsz):
                    # NOTE: a row that produced no window (zero-length audio)
                    # has an empty selection here, so the ratio is 0 / 0.
                    weights_bii = weights[bids == bii]
                    weighted_preds.append(
                        (
                            (preds[bids == bii] * weights_bii).sum() / weights_bii.sum()
                        ).item()
                    )
                all_result[axis] = weighted_preds
            # re-arrange result: dict of per-axis lists -> list of per-row dicts
            all_rows = [
                dict(zip(all_result.keys(), vv)) for vv in zip(*all_result.values())
            ]
            # convert to json str
            all_rows = [json.dumps(x) for x in all_rows]
            return all_rows
|
183 |
+
|
184 |
+
|
185 |
+
def load_dataset(path, start=None, end=None) -> List[Dict[str, Any]]:
    """Read rows ``start <= index < end`` of a jsonl file.

    Args:
        path: path of a text file with one JSON document per line.
        start: index of the first line to keep; ``None`` means from the
            beginning of the file.
        end: index one past the last line to keep; ``None`` means up to the
            end of the file.

    Returns:
        The decoded JSON documents of the selected lines, in file order.
        Lines that contain only whitespace are skipped; they still count
        towards the line index.

    Raises:
        json.JSONDecodeError: if a selected non-blank line is not valid JSON.
    """
    first = 0 if start is None else start
    metadata = []
    with open(path, encoding="utf-8") as fr:
        for ii, line in enumerate(fr):
            if end is not None and ii >= end:
                # Nothing past ``end`` can be selected; stop reading.
                break
            if ii < first or not line.strip():
                continue
            metadata.append(json.loads(line))
    return metadata
|
193 |
+
|
194 |
+
|
195 |
+
def main_predict(input_file, ckpt, batch_size=10):
    """Score every row of ``input_file`` with the checkpoint at ``ckpt``.

    Args:
        input_file: either a string, taken as the path of a jsonl file, or an
            already loaded sequence of metadata rows.
        ckpt: checkpoint location handed to the predictor.
        batch_size: number of rows passed to the predictor per call.

    Returns:
        List with one prediction (JSON string) per input row, in input order.
    """
    predictor = AesWavlmPredictorMultiOutput(checkpoint_pth=ckpt, data_col="path")
    predictor.setup_model()

    # A string is a path to load; anything else is used as the rows directly.
    metadata = (
        load_dataset(input_file, 0, 2**64)
        if isinstance(input_file, str)
        else input_file
    )

    outputs = []
    batch_starts = range(0, len(metadata), batch_size)
    for begin in tqdm(batch_starts):
        rows = metadata[begin : begin + batch_size]
        outputs.extend(predictor.forward(rows))

    # Sanity check: the predictor must return exactly one result per row.
    assert len(outputs) == len(
        metadata
    ), f"Output {len(outputs)} != input {len(metadata)} length"

    return outputs
|
src/audiobox_aesthetics/model/__init__.py
ADDED
File without changes
|
src/audiobox_aesthetics/model/aes_wavlm.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
from dataclasses import dataclass
|
8 |
+
import sys
|
9 |
+
from torch import nn
|
10 |
+
import torch
|
11 |
+
|
12 |
+
from .utils import create_mlp_block
|
13 |
+
from .wavlm import WavLM, WavLMConfig
|
14 |
+
|
15 |
+
|
16 |
+
# Settings handed to WavLMConfig for the audio encoder built in this module.
DEFAULT_AUDIO_CFG = WavLMConfig(
    dict(
        # encoder architecture
        extractor_mode="default",
        encoder_layers=12,
        encoder_embed_dim=768,
        encoder_ffn_embed_dim=3072,
        encoder_attention_heads=12,
        activation_fn="gelu",
        # dropout settings
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        encoder_layerdrop=0.05,
        dropout_input=0.1,
        dropout_features=0.1,
        layer_norm_first=False,
        # convolutional feature extractor
        conv_feature_layers="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
        conv_bias=False,
        feature_grad_mult=0.1,
        # masking
        mask_length=10,
        mask_prob=0.8,
        mask_selection="static",
        mask_other=0.0,
        no_mask_overlap=False,
        mask_min_space=1,
        mask_channel_length=10,
        mask_channel_prob=0.0,
        mask_channel_selection="static",
        mask_channel_other=0.0,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        # positional embeddings
        conv_pos=128,
        conv_pos_groups=16,
        relative_position_embedding=True,
        num_buckets=320,
        max_distance=800,
        gru_rel_pos=True,
        # input normalisation
        normalize=False,
    )
)
|
55 |
+
|
56 |
+
|
57 |
+
@dataclass(eq=False)
class Normalize:
    """Affine normalisation defined by a mean and a standard deviation."""

    mean: float
    std: float

    def transform(self, x):
        """Return ``x`` shifted by ``-mean`` and divided by ``std``."""
        centered = x - self.mean
        return centered / self.std

    def inverse(self, x):
        """Undo ``transform``: scale ``x`` by ``std`` and add ``mean``."""
        scaled = x * self.std
        return scaled + self.mean
|
67 |
+
|
68 |
+
|
69 |
+
AXES_NAME = ["CE", "CU", "PC", "PQ"]
|
70 |
+
|
71 |
+
|
72 |
+
@dataclass(eq=False)
class WavlmAudioEncoderMultiOutput(nn.Module):
    """WavLM encoder with one projection head per axis in ``AXES_NAME``.

    ``forward`` returns a dictionary mapping each axis name to the output of
    its projection head, computed from a masked time-average of the WavLM
    layer outputs.
    """

    proj_num_layer: int = 1
    proj_ln: bool = False
    proj_act_fn: str = "gelu"
    proj_dropout: float = 0
    nth_layer: int = 13
    use_weighted_layer_sum: bool = True
    precision: str = "32"  # replaced by the matching torch.dtype in __post_init__
    normalize_embed: bool = True
    output_dim: int = 1

    def __post_init__(self):
        """Build the WavLM backbone, the per-axis heads and precision settings.

        Raises:
            KeyError: if ``precision`` is not one of "64", "32", "16", "bf16".
        """
        super().__init__()
        amodel_cfg = DEFAULT_AUDIO_CFG
        self.wavlm_model = WavLM(amodel_cfg)
        wavlm_out_dim = self.wavlm_model.cfg.encoder_embed_dim

        self.axes_name = AXES_NAME
        self.proj_layer = nn.ModuleDict(
            {
                x: nn.Sequential(
                    *create_mlp_block(
                        wavlm_out_dim,
                        self.output_dim,
                        self.proj_num_layer,
                        self.proj_act_fn,
                        self.proj_ln,
                        dropout=self.proj_dropout,
                    )
                )
                for x in self.axes_name
            }
        )
        if self.use_weighted_layer_sum:
            # One learnable weight per layer output, initialised uniformly.
            self.layer_weights = nn.ParameterDict(
                {
                    x: torch.nn.Parameter(torch.ones(self.nth_layer) / (self.nth_layer))
                    for x in self.axes_name
                }
            )

        precision_map = {
            "64": torch.float64,
            "32": torch.float32,
            "16": torch.half,
            "bf16": torch.bfloat16,
        }
        # Decide on autocast from the precision *string*. Once self.precision
        # holds a torch.dtype, str() of it is e.g. "torch.bfloat16" and would
        # never match "16" / "bf16", leaving autocast permanently disabled.
        precision_key = str(self.precision)
        self.precision = precision_map[precision_key]
        self.enable_autocast = precision_key in {"16", "bf16"}
        print(
            f"precision: {self.precision}, enable autocast: {self.enable_autocast}",
            file=sys.stderr,
        )

    def forward(self, batch):
        """Predict one value set per axis for a batch of waveforms.

        Args:
            batch: dictionary with a 3-D "wav" tensor ([B, C, T]; dim 1 is
                squeezed away) and, optionally, a "mask" tensor of the same
                layout that is True on real samples. Without "mask" no
                sample is treated as padding.

        Returns:
            Dictionary mapping each axis name to the output of its projection
            head with the last dimension squeezed.
        """
        assert batch["wav"].ndim == 3

        # frames: [B, C, T]
        wav = batch["wav"].squeeze(1)

        if "mask" in batch:
            padding_mask = ~batch["mask"].squeeze(1)
        else:
            padding_mask = torch.zeros_like(wav, dtype=torch.bool)

        autocast_ctx = torch.amp.autocast(
            device_type=wav.device.type,
            dtype=self.precision,
            enabled=self.enable_autocast,
        )
        # NOTE(review): SOURCE's indentation is lost, so the extent of the
        # original ``with`` body is assumed to cover the whole computation.
        with autocast_ctx, torch.no_grad():
            if self.wavlm_model.cfg.normalize:
                wav = torch.nn.functional.layer_norm(wav, wav.shape)
            (_, all_outputs), embed_padding_mask = self.wavlm_model.extract_features(
                source=wav,
                padding_mask=padding_mask,
                output_layer=self.nth_layer,
                ret_layer_results=True,
            )
            all_outputs = torch.stack([gg[0] for gg in all_outputs], dim=-1)  # T B C L
            preds = {}
            for name in self.axes_name:
                if self.use_weighted_layer_sum:
                    norm_weights = torch.nn.functional.softmax(
                        self.layer_weights[name], dim=-1
                    )  # L
                    audio_embed = torch.einsum("tbcl,l->btc", all_outputs, norm_weights)
                else:
                    # all_outputs is the stacked T B C L tensor here, so take
                    # the last layer along the final axis and move the batch
                    # axis first (T B C -> B T C), matching the einsum branch.
                    audio_embed = all_outputs[..., -1].transpose(1, 0)

                # Average over time, counting only non-padded frames.
                embed_mask = (
                    (~embed_padding_mask).unsqueeze(dim=-1).type_as(audio_embed)
                )
                audio_embed = (audio_embed * embed_mask).sum(dim=1) / embed_mask.sum(
                    dim=1
                ).clamp(min=1)
                if self.normalize_embed:
                    audio_embed = torch.nn.functional.normalize(audio_embed, dim=-1)

                preds[name] = self.proj_layer[name](audio_embed).squeeze(-1)
        return preds
|
src/audiobox_aesthetics/model/utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
|
4 |
+
# This source code is licensed under the license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
from torch import nn
|
8 |
+
|
9 |
+
|
10 |
+
def create_mlp_block(input_dim, output_dim, num_layer, act_fn, layer_norm, dropout=0):
    """Build the layer list of an MLP with ``num_layer`` linear layers.

    Every layer but the last maps ``input_dim -> input_dim`` and is followed
    by an optional LayerNorm, the activation and an optional Dropout; the
    last layer maps ``input_dim -> output_dim`` with nothing after it.

    Args:
        input_dim: input and hidden width.
        output_dim: width of the final linear layer.
        num_layer: number of linear layers; 0 yields an empty list.
        act_fn: activation name; only "gelu" is supported. It is only
            inspected when there is at least one hidden layer.
        layer_norm: whether to add a LayerNorm after each hidden linear layer.
        dropout: dropout probability after each hidden activation; 0 adds no
            Dropout module.

    Returns:
        List of ``nn.Module`` objects, ready to be unpacked into
        ``nn.Sequential``.

    Raises:
        ValueError: if a hidden layer is required and ``act_fn`` is not "gelu".
    """
    proj_layer = []
    for ii in range(num_layer):
        if ii == num_layer - 1:
            proj_layer.append(nn.Linear(input_dim, output_dim))
        else:
            proj_layer.append(nn.Linear(input_dim, input_dim))
            if layer_norm:
                proj_layer.append(nn.LayerNorm(normalized_shape=(input_dim)))
            if act_fn == "gelu":
                proj_layer.append(nn.GELU())
            else:
                raise ValueError(
                    f"Unsupported activation function {act_fn!r}; expected 'gelu'"
                )
            if dropout != 0:
                proj_layer.append(nn.Dropout(p=dropout))
    return proj_layer
|
src/audiobox_aesthetics/model/wavlm.py
ADDED
@@ -0,0 +1,1597 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
|
3 |
+
# Based on WavLM code
|
4 |
+
# --------------------------------------------------------
|
5 |
+
# WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing (https://arxiv.org/abs/2110.13900.pdf)
|
6 |
+
# Github source: https://github.com/microsoft/unilm/tree/master/wavlm
|
7 |
+
# Copyright (c) 2021 Microsoft
|
8 |
+
# Licensed under The MIT License [see LICENSE for details]
|
9 |
+
# Based on fairseq code bases
|
10 |
+
# https://github.com/pytorch/fairseq
|
11 |
+
# --------------------------------------------------------
|
12 |
+
|
13 |
+
import logging
|
14 |
+
import math
|
15 |
+
|
16 |
+
from typing import Dict, List, Optional, Tuple
|
17 |
+
|
18 |
+
import numpy as np
|
19 |
+
|
20 |
+
import warnings
|
21 |
+
import torch
|
22 |
+
import torch.nn as nn
|
23 |
+
import torch.nn.functional as F
|
24 |
+
from torch import Tensor
|
25 |
+
from torch.nn import LayerNorm, Parameter
|
26 |
+
|
27 |
+
|
28 |
+
logger = logging.getLogger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
class TransposeLast(nn.Module):
    """Swap the last two dimensions of the input.

    If ``deconstruct_idx`` is given, the forward input is first indexed with
    it (``x[deconstruct_idx]``) and the selected item is transposed instead.
    """

    def __init__(self, deconstruct_idx=None):
        super().__init__()
        # Index applied to the forward input before transposing; None disables it.
        self.deconstruct_idx = deconstruct_idx

    def forward(self, x):
        """Return the (optionally indexed) input with dims -2 and -1 exchanged."""
        selected = x if self.deconstruct_idx is None else x[self.deconstruct_idx]
        return selected.transpose(-2, -1)
|
40 |
+
|
41 |
+
|
42 |
+
class Fp32LayerNorm(nn.LayerNorm):
    """LayerNorm that always computes in float32.

    The input and the affine parameters (when present) are cast to float32
    for the normalization, and the result is cast back to the input's dtype.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        """Normalize ``input`` in float32 and return it in its original dtype."""
        weight_fp32 = None if self.weight is None else self.weight.float()
        bias_fp32 = None if self.bias is None else self.bias.float()
        normalized = F.layer_norm(
            input.float(), self.normalized_shape, weight_fp32, bias_fp32, self.eps
        )
        return normalized.type_as(input)
|
55 |
+
|
56 |
+
|
57 |
+
class Fp32GroupNorm(nn.GroupNorm):
    """GroupNorm that always computes in float32.

    The input and the affine parameters (when present) are cast to float32
    for the normalization, and the result is cast back to the input's dtype.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        """Normalize ``input`` in float32 and return it in its original dtype."""
        weight_fp32 = None if self.weight is None else self.weight.float()
        bias_fp32 = None if self.bias is None else self.bias.float()
        normalized = F.group_norm(
            input.float(), self.num_groups, weight_fp32, bias_fp32, self.eps
        )
        return normalized.type_as(input)
|
70 |
+
|
71 |
+
|
72 |
+
class GradMultiply(torch.autograd.Function):
    """Autograd function that passes values through and rescales the gradient.

    ``forward`` returns a tensor built from ``x`` via ``x.new(x)`` and remembers
    ``scale``; ``backward`` multiplies the incoming gradient by that scale.
    """

    @staticmethod
    def forward(ctx, x, scale):
        # Stash the factor on the context so backward can apply it.
        ctx.scale = scale
        return x.new(x)

    @staticmethod
    def backward(ctx, grad):
        # One gradient per forward argument: `scale` itself gets none.
        scaled_grad = grad * ctx.scale
        return scaled_grad, None
|
82 |
+
|
83 |
+
|
84 |
+
class SamePad(nn.Module):
    """Trim trailing positions from the last dimension of a 3-D tensor.

    The number of positions removed is fixed at construction time:
    ``kernel_size - 1`` when ``causal`` is set, otherwise 1 for an even
    ``kernel_size`` and 0 for an odd one.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        if causal:
            trailing = kernel_size - 1
        else:
            trailing = int(kernel_size % 2 == 0)
        # Count of trailing elements dropped in forward().
        self.remove = trailing

    def forward(self, x):
        """Return ``x`` without its last ``self.remove`` entries along dim 2."""
        if self.remove <= 0:
            return x
        return x[:, :, : -self.remove]
|
96 |
+
|
97 |
+
|
98 |
+
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        """Create the sigmoid submodule used by :meth:`forward`."""
        super().__init__()
        self.act = torch.nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied element-wise by ``sigmoid(x)``."""
        return x * self.act(x)
|
108 |
+
|
109 |
+
|
110 |
+
class GLU_Linear(nn.Module):
    """Linear projection followed by a gated linear unit.

    The input is projected to ``2 * output_dim`` features along its last
    dimension. The first half is the value, the second half is the gate:

    * ``glu_type="bilinear"``: ``value * gate`` (no activation on the gate);
    * ``"sigmoid"``, ``"swish"``, ``"relu"``, ``"gelu"``: ``value * act(gate)``.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        glu_type: one of the gate types listed above.
        bias_in_glu: whether the linear projection has a bias term.

    Raises:
        ValueError: if ``glu_type`` is not one of the supported names.
    """

    def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
        super().__init__()

        self.glu_type = glu_type
        self.output_dim = output_dim

        if glu_type == "sigmoid":
            self.glu_act = torch.nn.Sigmoid()
        elif glu_type == "swish":
            self.glu_act = Swish()
        elif glu_type == "relu":
            self.glu_act = torch.nn.ReLU()
        elif glu_type == "gelu":
            self.glu_act = torch.nn.GELU()
        elif glu_type != "bilinear":
            # Fail at construction time instead of with an AttributeError on
            # `glu_act` during the first forward pass.
            raise ValueError(
                "unsupported glu_type {!r}; expected one of 'sigmoid', 'swish', "
                "'relu', 'gelu', 'bilinear'".format(glu_type)
            )

        self.linear = nn.Linear(input_dim, output_dim * 2, bool(bias_in_glu))

    def forward(self, x):
        """Apply the gated projection over the last dimension of ``x``."""
        x = self.linear(x)

        # Ellipsis indexing keeps the 3-D behaviour and also accepts any other
        # rank, since the projection only acts on the last dimension.
        value = x[..., 0 : self.output_dim]
        gate = x[..., self.output_dim : self.output_dim * 2]

        if self.glu_type == "bilinear":
            return value * gate
        return value * self.glu_act(gate)
|
146 |
+
|
147 |
+
|
148 |
+
def gelu_accurate(x):
    """Tanh-based GELU approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    The ``sqrt(2 / pi)`` constant is computed once and cached on the function
    object as ``gelu_accurate._a``.
    """
    try:
        coeff = gelu_accurate._a
    except AttributeError:
        coeff = gelu_accurate._a = math.sqrt(2 / math.pi)
    inner = x + 0.044715 * torch.pow(x, 3)
    return 0.5 * x * (1 + torch.tanh(coeff * inner))
|
154 |
+
|
155 |
+
|
156 |
+
def gelu(x: torch.Tensor) -> torch.Tensor:
    """Apply GELU in float32 and cast the result back to the dtype of ``x``."""
    activated = torch.nn.functional.gelu(x.float())
    return activated.type_as(x)
|
158 |
+
|
159 |
+
|
160 |
+
def get_activation_fn(activation: str):
    """Return the callable that implements the activation named ``activation``.

    Recognised names are ``relu``, ``gelu``, ``gelu_accurate``, ``gelu_fast``
    (a renamed alias of ``gelu_accurate`` that emits a warning), ``tanh``,
    ``linear`` and ``glu``. The last two both return an identity function.

    Raises:
        RuntimeError: if the name is not recognised.
    """
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return gelu
    if activation == "gelu_fast":
        warnings.warn("--activation-fn=gelu_fast has been renamed to gelu_accurate")
        return gelu_accurate
    if activation == "gelu_accurate":
        return gelu_accurate
    if activation == "tanh":
        return torch.tanh
    if activation == "linear" or activation == "glu":
        # Identity here; this function applies no gating for "glu".
        return lambda x: x
    raise RuntimeError("--activation-fn {} not supported".format(activation))
|
180 |
+
|
181 |
+
|
182 |
+
def init_bert_params(module):
    """Apply BERT-style initialization to a single module, in place.

    * ``nn.Linear``: weight drawn from N(0, 0.02); bias (if any) zeroed.
    * ``nn.Embedding``: weight drawn from N(0, 0.02); the ``padding_idx`` row
      (if any) zeroed.
    * ``MultiheadAttention``: the ``q_proj``, ``k_proj`` and ``v_proj`` weights
      drawn from N(0, 0.02).

    Modules of any other type are left untouched.
    """

    def _fill_normal(tensor):
        # with FSDP, module params will be on CUDA, so we sample on CPU and
        # copy back so that the RNG is consistent with and without FSDP
        sampled = tensor.cpu().normal_(mean=0.0, std=0.02)
        tensor.copy_(sampled.to(tensor.device))

    if isinstance(module, nn.Linear):
        _fill_normal(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()

    if isinstance(module, nn.Embedding):
        _fill_normal(module.weight.data)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

    if isinstance(module, MultiheadAttention):
        # Same q -> k -> v order as before so the RNG stream is consumed identically.
        for projection in (module.q_proj, module.k_proj, module.v_proj):
            _fill_normal(projection.weight.data)
|
213 |
+
|
214 |
+
|
215 |
+
def quant_noise(module, p, block_size):
    """
    Attach quantization noise to a module's weights, for subsequent
    quantization with Iterative Product Quantization as described in
    "Training with Quantization Noise for Extreme Model Compression".

    Args:
        - module: nn.Module
        - p: amount of Quantization Noise
        - block_size: size of the blocks for subsequent quantization with iPQ

    Returns:
        The same ``module`` object. When ``p <= 0`` it is returned untouched;
        otherwise a forward pre-hook is registered on it first.

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - Only Linear, Embedding and Conv2d modules are supported for the moment
        - For more detail on how to quantize by blocks with convolutional weights,
          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
        - The noise is the simplest form stated in the paper: blocks of weights
          are randomly dropped (zeroed) and the result is scaled by 1 / (1 - p)
        - The hook only acts while the module is in training mode
    """

    # if no quantization noise, don't register hook
    if p <= 0:
        return module

    # supported modules
    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # a 4-D weight means a convolution; otherwise it is treated as a 2-D matrix
    is_conv = module.weight.ndim == 4

    # test whether module.weight has the right sizes wrt block_size
    if is_conv:
        if module.kernel_size == (1, 1):
            # 1x1 convolutions
            assert (
                module.in_channels % block_size == 0
            ), "Input channels must be a multiple of block sizes"
        else:
            # regular convolutions
            k = module.kernel_size[0] * module.kernel_size[1]
            assert k % block_size == 0, "Kernel size must be a multiple of block size"
    else:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"

    def _sample_block_mask(num_blocks, row_width, device):
        # One Bernoulli(p) draw per block, expanded so each draw covers
        # `block_size` consecutive entries of a row.
        draws = torch.zeros(num_blocks, device=device)
        draws.bernoulli_(p)
        return draws.repeat_interleave(block_size, -1).view(-1, row_width)

    def _forward_pre_hook(mod, input):
        # no noise for evaluation
        if not mod.training:
            return

        weight = mod.weight
        if not is_conv:
            in_features = weight.size(1)
            out_features = weight.size(0)
            mask = _sample_block_mask(
                in_features // block_size * out_features, in_features, weight.device
            )
        elif mod.kernel_size == (1, 1):
            in_channels = mod.in_channels
            out_channels = mod.out_channels
            # NOTE(review): this mask is 2-D while the conv weight is 4-D;
            # behaviour kept as-is — confirm the 1x1 path is exercised.
            mask = _sample_block_mask(
                int(in_channels // block_size * out_channels),
                in_channels,
                weight.device,
            )
        else:
            # one draw per (out, in) kernel, repeated over the kernel window
            mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
            mask.bernoulli_(p)
            mask = (
                mask.unsqueeze(2)
                .unsqueeze(3)
                .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
            )

        # scale weights and apply mask
        mask = mask.to(torch.bool)  # x.bool() is not currently supported in TorchScript
        s = 1 / (1 - p)
        mod.weight.data = s * weight.masked_fill(mask, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module
|
313 |
+
|
314 |
+
|
315 |
+
class MultiheadAttention(nn.Module):
|
316 |
+
"""Multi-headed attention.
|
317 |
+
|
318 |
+
See "Attention Is All You Need" for more details.
|
319 |
+
"""
|
320 |
+
|
321 |
+
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    has_relative_attention_bias=False,
    num_buckets=32,
    max_distance=128,
    gru_rel_pos=False,
    rescale_init=False,
):
    """Build the projections and optional relative-position components.

    Args:
        embed_dim: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        kdim, vdim: input widths of the key / value projections
            (default to ``embed_dim``).
        dropout: probability given to the ``nn.Dropout`` module.
        bias: whether the q, v and output projections carry a bias.
        add_bias_kv: create learnable ``bias_k`` / ``bias_v`` parameters.
        add_zero_attn: stored as ``self.add_zero_attn``.
        self_attention, encoder_decoder_attention: attention-mode flags;
            self-attention requires ``kdim == vdim == embed_dim``.
        q_noise, qn_block_size: forwarded to ``quant_noise`` for every projection.
        has_relative_attention_bias: create the bucket embedding
            (``num_buckets`` x ``num_heads``).
        num_buckets, max_distance: stored for the relative-position bucketing.
        gru_rel_pos: create ``grep_linear`` and ``grep_a``.
        rescale_init: when true, the key projection is created without a bias.
    """
    super().__init__()

    key_dim = embed_dim if kdim is None else kdim
    value_dim = embed_dim if vdim is None else vdim
    self.embed_dim = embed_dim
    self.kdim = key_dim
    self.vdim = value_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = nn.Dropout(dropout)

    self.has_relative_attention_bias = has_relative_attention_bias
    self.num_buckets = num_buckets
    self.max_distance = max_distance
    if self.has_relative_attention_bias:
        self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

    per_head = embed_dim // num_heads
    self.head_dim = per_head
    self.q_head_dim = per_head
    self.k_head_dim = per_head
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim**-0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and " "value to be of the same size"
    )

    # The key projection ignores `bias`: it has one unless rescale_init is set.
    k_bias = not rescale_init

    # Creation order (k, v, q, out) is kept so parameter registration and
    # RNG consumption are unchanged.
    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=k_bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.gru_rel_pos = gru_rel_pos
    if self.gru_rel_pos:
        # 8 outputs per head position; forward() reshapes them to (2, 4).
        self.grep_linear = nn.Linear(self.q_head_dim, 8)
        self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))

    self.reset_parameters()
|
406 |
+
|
407 |
+
def reset_parameters(self):
|
408 |
+
if self.qkv_same_dim:
|
409 |
+
# Empirically observed the convergence to be much better with
|
410 |
+
# the scaled initialization
|
411 |
+
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
|
412 |
+
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
|
413 |
+
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
|
414 |
+
else:
|
415 |
+
nn.init.xavier_uniform_(self.k_proj.weight)
|
416 |
+
nn.init.xavier_uniform_(self.v_proj.weight)
|
417 |
+
nn.init.xavier_uniform_(self.q_proj.weight)
|
418 |
+
|
419 |
+
nn.init.xavier_uniform_(self.out_proj.weight)
|
420 |
+
if self.out_proj.bias is not None:
|
421 |
+
nn.init.constant_(self.out_proj.bias, 0.0)
|
422 |
+
if self.bias_k is not None:
|
423 |
+
nn.init.xavier_normal_(self.bias_k)
|
424 |
+
if self.bias_v is not None:
|
425 |
+
nn.init.xavier_normal_(self.bias_v)
|
426 |
+
if self.has_relative_attention_bias:
|
427 |
+
nn.init.xavier_normal_(self.relative_attention_bias.weight)
|
428 |
+
|
429 |
+
def _relative_positions_bucket(self, relative_positions, bidirectional=True):
|
430 |
+
num_buckets = self.num_buckets
|
431 |
+
max_distance = self.max_distance
|
432 |
+
relative_buckets = 0
|
433 |
+
|
434 |
+
if bidirectional:
|
435 |
+
num_buckets = num_buckets // 2
|
436 |
+
relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
|
437 |
+
relative_positions = torch.abs(relative_positions)
|
438 |
+
else:
|
439 |
+
relative_positions = -torch.min(
|
440 |
+
relative_positions, torch.zeros_like(relative_positions)
|
441 |
+
)
|
442 |
+
|
443 |
+
max_exact = num_buckets // 2
|
444 |
+
is_small = relative_positions < max_exact
|
445 |
+
|
446 |
+
relative_postion_if_large = max_exact + (
|
447 |
+
torch.log(relative_positions.float() / max_exact)
|
448 |
+
/ math.log(max_distance / max_exact)
|
449 |
+
* (num_buckets - max_exact)
|
450 |
+
).to(torch.long)
|
451 |
+
relative_postion_if_large = torch.min(
|
452 |
+
relative_postion_if_large,
|
453 |
+
torch.full_like(relative_postion_if_large, num_buckets - 1),
|
454 |
+
)
|
455 |
+
|
456 |
+
relative_buckets += torch.where(
|
457 |
+
is_small, relative_positions, relative_postion_if_large
|
458 |
+
)
|
459 |
+
return relative_buckets
|
460 |
+
|
461 |
+
def compute_bias(self, query_length, key_length):
|
462 |
+
context_position = torch.arange(query_length, dtype=torch.long)[:, None]
|
463 |
+
memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
|
464 |
+
relative_position = memory_position - context_position
|
465 |
+
relative_position_bucket = self._relative_positions_bucket(
|
466 |
+
relative_position, bidirectional=True
|
467 |
+
)
|
468 |
+
relative_position_bucket = relative_position_bucket.to(
|
469 |
+
self.relative_attention_bias.weight.device
|
470 |
+
)
|
471 |
+
values = self.relative_attention_bias(relative_position_bucket)
|
472 |
+
values = values.permute([2, 0, 1])
|
473 |
+
return values
|
474 |
+
|
475 |
+
def forward(
|
476 |
+
self,
|
477 |
+
query,
|
478 |
+
key: Optional[Tensor],
|
479 |
+
value: Optional[Tensor],
|
480 |
+
key_padding_mask: Optional[Tensor] = None,
|
481 |
+
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
|
482 |
+
need_weights: bool = True,
|
483 |
+
static_kv: bool = False,
|
484 |
+
attn_mask: Optional[Tensor] = None,
|
485 |
+
before_softmax: bool = False,
|
486 |
+
need_head_weights: bool = False,
|
487 |
+
position_bias: Optional[Tensor] = None,
|
488 |
+
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
|
489 |
+
"""Input shape: Time x Batch x Channel
|
490 |
+
|
491 |
+
Args:
|
492 |
+
key_padding_mask (ByteTensor, optional): mask to exclude
|
493 |
+
keys that are pads, of shape `(batch, src_len)`, where
|
494 |
+
padding elements are indicated by 1s.
|
495 |
+
need_weights (bool, optional): return the attention weights,
|
496 |
+
averaged over heads (default: False).
|
497 |
+
attn_mask (ByteTensor, optional): typically used to
|
498 |
+
implement causal attention, where the mask prevents the
|
499 |
+
attention from looking forward in time (default: None).
|
500 |
+
before_softmax (bool, optional): return the raw attention
|
501 |
+
weights and values before the attention softmax.
|
502 |
+
need_head_weights (bool, optional): return the attention
|
503 |
+
weights for each head. Implies *need_weights*. Default:
|
504 |
+
return the average attention weights over all heads.
|
505 |
+
"""
|
506 |
+
if need_head_weights:
|
507 |
+
need_weights = True
|
508 |
+
|
509 |
+
is_tpu = query.device.type == "xla"
|
510 |
+
|
511 |
+
tgt_len, bsz, embed_dim = query.size()
|
512 |
+
src_len = tgt_len
|
513 |
+
assert embed_dim == self.embed_dim
|
514 |
+
assert list(query.size()) == [tgt_len, bsz, embed_dim]
|
515 |
+
if key is not None:
|
516 |
+
src_len, key_bsz, _ = key.size()
|
517 |
+
if not torch.jit.is_scripting():
|
518 |
+
assert key_bsz == bsz
|
519 |
+
assert value is not None
|
520 |
+
assert src_len, bsz == value.shape[:2]
|
521 |
+
|
522 |
+
if self.has_relative_attention_bias and position_bias is None:
|
523 |
+
position_bias = self.compute_bias(tgt_len, src_len)
|
524 |
+
position_bias = (
|
525 |
+
position_bias.unsqueeze(0)
|
526 |
+
.repeat(bsz, 1, 1, 1)
|
527 |
+
.view(bsz * self.num_heads, tgt_len, src_len)
|
528 |
+
)
|
529 |
+
|
530 |
+
if (
|
531 |
+
not is_tpu # don't use PyTorch version on TPUs
|
532 |
+
and incremental_state is None
|
533 |
+
and not static_kv
|
534 |
+
# A workaround for quantization to work. Otherwise JIT compilation
|
535 |
+
# treats bias in linear module as method.
|
536 |
+
and not torch.jit.is_scripting()
|
537 |
+
and self.q_head_dim == self.head_dim
|
538 |
+
):
|
539 |
+
assert key is not None and value is not None
|
540 |
+
assert attn_mask is None
|
541 |
+
|
542 |
+
attn_mask_rel_pos = None
|
543 |
+
if position_bias is not None:
|
544 |
+
attn_mask_rel_pos = position_bias
|
545 |
+
if self.gru_rel_pos:
|
546 |
+
query_layer = query.transpose(0, 1)
|
547 |
+
new_x_shape = query_layer.size()[:-1] + (self.num_heads, -1)
|
548 |
+
query_layer = query_layer.view(*new_x_shape)
|
549 |
+
query_layer = query_layer.permute(0, 2, 1, 3)
|
550 |
+
_B, _H, _L, __ = query_layer.size()
|
551 |
+
|
552 |
+
gate_a, gate_b = torch.sigmoid(
|
553 |
+
self.grep_linear(query_layer)
|
554 |
+
.view(_B, _H, _L, 2, 4)
|
555 |
+
.sum(-1, keepdim=False)
|
556 |
+
).chunk(2, dim=-1)
|
557 |
+
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
|
558 |
+
attn_mask_rel_pos = (
|
559 |
+
gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias
|
560 |
+
)
|
561 |
+
|
562 |
+
attn_mask_rel_pos = attn_mask_rel_pos.view((-1, tgt_len, tgt_len))
|
563 |
+
k_proj_bias = self.k_proj.bias
|
564 |
+
if k_proj_bias is None:
|
565 |
+
k_proj_bias = torch.zeros_like(self.q_proj.bias)
|
566 |
+
|
567 |
+
x, attn = F.multi_head_attention_forward(
|
568 |
+
query,
|
569 |
+
key,
|
570 |
+
value,
|
571 |
+
self.embed_dim,
|
572 |
+
self.num_heads,
|
573 |
+
torch.empty([0]),
|
574 |
+
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
|
575 |
+
self.bias_k,
|
576 |
+
self.bias_v,
|
577 |
+
self.add_zero_attn,
|
578 |
+
self.dropout_module.p,
|
579 |
+
self.out_proj.weight,
|
580 |
+
self.out_proj.bias,
|
581 |
+
self.training,
|
582 |
+
# self.training or self.dropout_module.apply_during_inference,
|
583 |
+
key_padding_mask,
|
584 |
+
need_weights,
|
585 |
+
attn_mask_rel_pos,
|
586 |
+
use_separate_proj_weight=True,
|
587 |
+
q_proj_weight=self.q_proj.weight,
|
588 |
+
k_proj_weight=self.k_proj.weight,
|
589 |
+
v_proj_weight=self.v_proj.weight,
|
590 |
+
)
|
591 |
+
return x, attn, position_bias
|
592 |
+
|
593 |
+
if incremental_state is not None:
|
594 |
+
saved_state = self._get_input_buffer(incremental_state)
|
595 |
+
if saved_state is not None and "prev_key" in saved_state:
|
596 |
+
# previous time steps are cached - no need to recompute
|
597 |
+
# key and value if they are static
|
598 |
+
if static_kv:
|
599 |
+
assert self.encoder_decoder_attention and not self.self_attention
|
600 |
+
key = value = None
|
601 |
+
else:
|
602 |
+
saved_state = None
|
603 |
+
|
604 |
+
if self.self_attention:
|
605 |
+
q = self.q_proj(query)
|
606 |
+
k = self.k_proj(query)
|
607 |
+
v = self.v_proj(query)
|
608 |
+
elif self.encoder_decoder_attention:
|
609 |
+
# encoder-decoder attention
|
610 |
+
q = self.q_proj(query)
|
611 |
+
if key is None:
|
612 |
+
assert value is None
|
613 |
+
k = v = None
|
614 |
+
else:
|
615 |
+
k = self.k_proj(key)
|
616 |
+
v = self.v_proj(key)
|
617 |
+
|
618 |
+
else:
|
619 |
+
assert key is not None and value is not None
|
620 |
+
q = self.q_proj(query)
|
621 |
+
k = self.k_proj(key)
|
622 |
+
v = self.v_proj(value)
|
623 |
+
q *= self.scaling
|
624 |
+
|
625 |
+
if self.bias_k is not None:
|
626 |
+
assert self.bias_v is not None
|
627 |
+
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
|
628 |
+
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
|
629 |
+
if attn_mask is not None:
|
630 |
+
attn_mask = torch.cat(
|
631 |
+
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
|
632 |
+
)
|
633 |
+
if key_padding_mask is not None:
|
634 |
+
key_padding_mask = torch.cat(
|
635 |
+
[
|
636 |
+
key_padding_mask,
|
637 |
+
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
|
638 |
+
],
|
639 |
+
dim=1,
|
640 |
+
)
|
641 |
+
|
642 |
+
q = (
|
643 |
+
q.contiguous()
|
644 |
+
.view(tgt_len, bsz * self.num_heads, self.q_head_dim)
|
645 |
+
.transpose(0, 1)
|
646 |
+
)
|
647 |
+
if k is not None:
|
648 |
+
k = (
|
649 |
+
k.contiguous()
|
650 |
+
.view(-1, bsz * self.num_heads, self.k_head_dim)
|
651 |
+
.transpose(0, 1)
|
652 |
+
)
|
653 |
+
if v is not None:
|
654 |
+
v = (
|
655 |
+
v.contiguous()
|
656 |
+
.view(-1, bsz * self.num_heads, self.head_dim)
|
657 |
+
.transpose(0, 1)
|
658 |
+
)
|
659 |
+
|
660 |
+
if saved_state is not None:
|
661 |
+
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
|
662 |
+
if "prev_key" in saved_state:
|
663 |
+
_prev_key = saved_state["prev_key"]
|
664 |
+
assert _prev_key is not None
|
665 |
+
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
|
666 |
+
if static_kv:
|
667 |
+
k = prev_key
|
668 |
+
else:
|
669 |
+
assert k is not None
|
670 |
+
k = torch.cat([prev_key, k], dim=1)
|
671 |
+
src_len = k.size(1)
|
672 |
+
if "prev_value" in saved_state:
|
673 |
+
_prev_value = saved_state["prev_value"]
|
674 |
+
assert _prev_value is not None
|
675 |
+
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
|
676 |
+
if static_kv:
|
677 |
+
v = prev_value
|
678 |
+
else:
|
679 |
+
assert v is not None
|
680 |
+
v = torch.cat([prev_value, v], dim=1)
|
681 |
+
prev_key_padding_mask: Optional[Tensor] = None
|
682 |
+
if "prev_key_padding_mask" in saved_state:
|
683 |
+
prev_key_padding_mask = saved_state["prev_key_padding_mask"]
|
684 |
+
assert k is not None and v is not None
|
685 |
+
key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
|
686 |
+
key_padding_mask=key_padding_mask,
|
687 |
+
prev_key_padding_mask=prev_key_padding_mask,
|
688 |
+
batch_size=bsz,
|
689 |
+
src_len=k.size(1),
|
690 |
+
static_kv=static_kv,
|
691 |
+
)
|
692 |
+
|
693 |
+
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
|
694 |
+
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
|
695 |
+
saved_state["prev_key_padding_mask"] = key_padding_mask
|
696 |
+
# In this branch incremental_state is never None
|
697 |
+
assert incremental_state is not None
|
698 |
+
incremental_state = self._set_input_buffer(incremental_state, saved_state)
|
699 |
+
assert k is not None
|
700 |
+
assert k.size(1) == src_len
|
701 |
+
|
702 |
+
# This is part of a workaround to get around fork/join parallelism
|
703 |
+
# not supporting Optional types.
|
704 |
+
if key_padding_mask is not None and key_padding_mask.dim() == 0:
|
705 |
+
key_padding_mask = None
|
706 |
+
|
707 |
+
if key_padding_mask is not None:
|
708 |
+
assert key_padding_mask.size(0) == bsz
|
709 |
+
assert key_padding_mask.size(1) == src_len
|
710 |
+
|
711 |
+
if self.add_zero_attn:
|
712 |
+
assert v is not None
|
713 |
+
src_len += 1
|
714 |
+
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
|
715 |
+
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
|
716 |
+
if attn_mask is not None:
|
717 |
+
attn_mask = torch.cat(
|
718 |
+
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
|
719 |
+
)
|
720 |
+
if key_padding_mask is not None:
|
721 |
+
key_padding_mask = torch.cat(
|
722 |
+
[
|
723 |
+
key_padding_mask,
|
724 |
+
torch.zeros(key_padding_mask.size(0), 1).type_as(
|
725 |
+
key_padding_mask
|
726 |
+
),
|
727 |
+
],
|
728 |
+
dim=1,
|
729 |
+
)
|
730 |
+
|
731 |
+
attn_weights = torch.bmm(q, k.transpose(1, 2))
|
732 |
+
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
|
733 |
+
|
734 |
+
assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
|
735 |
+
|
736 |
+
if attn_mask is not None:
|
737 |
+
attn_mask = attn_mask.unsqueeze(0)
|
738 |
+
attn_weights += attn_mask
|
739 |
+
|
740 |
+
if key_padding_mask is not None:
|
741 |
+
# don't attend to padding symbols
|
742 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
743 |
+
if not is_tpu:
|
744 |
+
attn_weights = attn_weights.masked_fill(
|
745 |
+
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
|
746 |
+
float("-inf"),
|
747 |
+
)
|
748 |
+
else:
|
749 |
+
attn_weights = attn_weights.transpose(0, 2)
|
750 |
+
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
|
751 |
+
attn_weights = attn_weights.transpose(0, 2)
|
752 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
753 |
+
|
754 |
+
if before_softmax:
|
755 |
+
return attn_weights, v, position_bias
|
756 |
+
|
757 |
+
if position_bias is not None:
|
758 |
+
if self.gru_rel_pos == 1:
|
759 |
+
query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
|
760 |
+
_B, _H, _L, __ = query_layer.size()
|
761 |
+
gate_a, gate_b = torch.sigmoid(
|
762 |
+
self.grep_linear(query_layer)
|
763 |
+
.view(_B, _H, _L, 2, 4)
|
764 |
+
.sum(-1, keepdim=False)
|
765 |
+
).chunk(2, dim=-1)
|
766 |
+
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
|
767 |
+
position_bias = (
|
768 |
+
gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias
|
769 |
+
)
|
770 |
+
|
771 |
+
position_bias = position_bias.view(attn_weights.size())
|
772 |
+
|
773 |
+
attn_weights = attn_weights + position_bias
|
774 |
+
|
775 |
+
attn_weights_float = F.softmax(attn_weights, dim=-1)
|
776 |
+
attn_weights = attn_weights_float.type_as(attn_weights)
|
777 |
+
attn_probs = self.dropout_module(attn_weights)
|
778 |
+
|
779 |
+
assert v is not None
|
780 |
+
attn = torch.bmm(attn_probs, v)
|
781 |
+
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
|
782 |
+
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
|
783 |
+
attn = self.out_proj(attn)
|
784 |
+
attn_weights: Optional[Tensor] = None
|
785 |
+
if need_weights:
|
786 |
+
attn_weights = attn_weights_float.view(
|
787 |
+
bsz, self.num_heads, tgt_len, src_len
|
788 |
+
).transpose(1, 0)
|
789 |
+
if not need_head_weights:
|
790 |
+
# average attention weights over heads
|
791 |
+
attn_weights = attn_weights.mean(dim=0)
|
792 |
+
|
793 |
+
return attn, attn_weights, position_bias
|
794 |
+
|
795 |
+
@staticmethod
|
796 |
+
def _append_prev_key_padding_mask(
|
797 |
+
key_padding_mask: Optional[Tensor],
|
798 |
+
prev_key_padding_mask: Optional[Tensor],
|
799 |
+
batch_size: int,
|
800 |
+
src_len: int,
|
801 |
+
static_kv: bool,
|
802 |
+
) -> Optional[Tensor]:
|
803 |
+
# saved key padding masks have shape (bsz, seq_len)
|
804 |
+
if prev_key_padding_mask is not None and static_kv:
|
805 |
+
new_key_padding_mask = prev_key_padding_mask
|
806 |
+
elif prev_key_padding_mask is not None and key_padding_mask is not None:
|
807 |
+
new_key_padding_mask = torch.cat(
|
808 |
+
[prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
|
809 |
+
)
|
810 |
+
# During incremental decoding, as the padding token enters and
|
811 |
+
# leaves the frame, there will be a time when prev or current
|
812 |
+
# is None
|
813 |
+
elif prev_key_padding_mask is not None:
|
814 |
+
if src_len > prev_key_padding_mask.size(1):
|
815 |
+
filler = torch.zeros(
|
816 |
+
(batch_size, src_len - prev_key_padding_mask.size(1)),
|
817 |
+
device=prev_key_padding_mask.device,
|
818 |
+
)
|
819 |
+
new_key_padding_mask = torch.cat(
|
820 |
+
[prev_key_padding_mask.float(), filler.float()], dim=1
|
821 |
+
)
|
822 |
+
else:
|
823 |
+
new_key_padding_mask = prev_key_padding_mask.float()
|
824 |
+
elif key_padding_mask is not None:
|
825 |
+
if src_len > key_padding_mask.size(1):
|
826 |
+
filler = torch.zeros(
|
827 |
+
(batch_size, src_len - key_padding_mask.size(1)),
|
828 |
+
device=key_padding_mask.device,
|
829 |
+
)
|
830 |
+
new_key_padding_mask = torch.cat(
|
831 |
+
[filler.float(), key_padding_mask.float()], dim=1
|
832 |
+
)
|
833 |
+
else:
|
834 |
+
new_key_padding_mask = key_padding_mask.float()
|
835 |
+
else:
|
836 |
+
new_key_padding_mask = prev_key_padding_mask
|
837 |
+
return new_key_padding_mask
|
838 |
+
|
839 |
+
def _get_input_buffer(
|
840 |
+
self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
|
841 |
+
) -> Dict[str, Optional[Tensor]]:
|
842 |
+
result = self.get_incremental_state(incremental_state, "attn_state")
|
843 |
+
if result is not None:
|
844 |
+
return result
|
845 |
+
else:
|
846 |
+
empty_result: Dict[str, Optional[Tensor]] = {}
|
847 |
+
return empty_result
|
848 |
+
|
849 |
+
def _set_input_buffer(
|
850 |
+
self,
|
851 |
+
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
|
852 |
+
buffer: Dict[str, Optional[Tensor]],
|
853 |
+
):
|
854 |
+
return self.set_incremental_state(incremental_state, "attn_state", buffer)
|
855 |
+
|
856 |
+
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
    """Return ``attn_weights`` unchanged.

    ``tgt_len``, ``src_len`` and ``bsz`` are accepted but not used here.
    """
    return attn_weights
|
858 |
+
|
859 |
+
|
860 |
+
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans

    Returns:
        boolean array of shape (batch size, timesteps) that is True at masked
        positions. Every row ends up with the same number of True entries,
        because rows with more masked positions are randomly sub-sampled down
        to the smallest per-row count.

    Raises:
        Exception: if mask_type is not one of the four supported values.
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    all_num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * all_sz / float(mask_length) + np.random.rand()
    )

    all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            # only the non-padded part of the row is eligible for masking
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length) + np.random.rand()
            )
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
        elif mask_type == "normal":
            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = np.random.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        # NOTE(review): when num_mask is 0, `lengths` is empty and the
        # assignment below raises IndexError -- confirm callers always
        # request at least one span (e.g. via min_masks).
        if sum(lengths) == 0:
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # place one span inside [s, e) and return the leftover
                # sub-ranges that are still large enough to be reused
                span_start = np.random.randint(s, e - length)
                mask_idc.extend(span_start + offset for offset in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - keep_length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # `np.int` was removed in NumPy 1.24; the builtin `int` is
                # the dtype it used to alias.
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    int,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    # no remaining range can host this span
                    break
                probs = lens / np.sum(lens)
                c = np.random.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            mask_idc = np.asarray(mask_idc)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1

            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)

            # expand every span start into the full list of covered positions
            mask_idc = np.asarray(
                [
                    mask_idc[j] + offset
                    for j in range(len(mask_idc))
                    for offset in range(lengths[j])
                ]
            )

        # drop positions past the usable length and collapse overlaps
        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))

    # equalise the number of masked positions across the batch
    min_len = min([len(m) for m in mask_idcs])
    for i, mask_idc in enumerate(mask_idcs):
        if len(mask_idc) > min_len:
            mask_idc = np.random.choice(mask_idc, min_len, replace=False)
        mask[i, mask_idc] = True

    return mask
|
983 |
+
|
984 |
+
|
985 |
+
class WavLMConfig:
    """Plain attribute container holding the WavLM hyper-parameters.

    All attributes are set to their defaults in ``__init__``; an optional
    mapping passed as ``cfg`` then overrides (or adds) attributes by name.
    """

    def __init__(self, cfg=None):
        # --- feature extractor / encoder layout ---
        # mode for feature extractor. "default" has a single group norm with
        # d groups in the first conv block, whereas "layer_norm" has layer
        # norms in every block (meant to use with normalize=True)
        self.extractor_mode: str = "default"
        # num encoder layers in the transformer
        self.encoder_layers: int = 12

        # encoder embedding dimension
        self.encoder_embed_dim: int = 768
        # encoder embedding dimension for FFN
        self.encoder_ffn_embed_dim: int = 3072
        # num encoder attention heads
        self.encoder_attention_heads: int = 12
        # activation function to use
        self.activation_fn: str = "gelu"

        # apply layernorm first in the transformer
        self.layer_norm_first: bool = False
        # string describing convolutional feature extraction layers in form
        # of a python list that contains [(dim, kernel_size, stride), ...]
        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"
        # include bias in conv encoder
        self.conv_bias: bool = False
        # multiply feature extractor var grads by this
        self.feature_grad_mult: float = 1.0

        # normalize input to have 0 mean and unit variance during training
        self.normalize: bool = False

        # --- dropouts ---
        # dropout probability for the transformer
        self.dropout: float = 0.1
        # dropout probability for attention weights
        self.attention_dropout: float = 0.1
        # dropout probability after activation in FFN
        self.activation_dropout: float = 0.0
        # probability of dropping a transformer layer
        self.encoder_layerdrop: float = 0.0
        # dropout to apply to the input (after feat extr)
        self.dropout_input: float = 0.0
        # dropout to apply to the features (after feat extr)
        self.dropout_features: float = 0.0

        # --- masking ---
        # mask length
        self.mask_length: int = 10
        # probability of replacing a token with mask
        self.mask_prob: float = 0.65
        # how to choose mask length
        self.mask_selection: str = "static"
        # secondary mask argument (used for more complex distributions),
        # see help in compute_mask_indices
        self.mask_other: float = 0
        # whether to allow masks to overlap
        self.no_mask_overlap: bool = False
        # min space between spans (if no overlap is enabled)
        self.mask_min_space: int = 1

        # --- channel masking ---
        # length of the mask for features (channels)
        self.mask_channel_length: int = 10
        # probability of replacing a feature with 0
        self.mask_channel_prob: float = 0.0
        # how to choose mask length for channel masking
        self.mask_channel_selection: str = "static"
        # secondary mask argument (used for more complex distributions),
        # see help in compute_mask_indices
        self.mask_channel_other: float = 0
        # whether to allow channel masks to overlap
        self.no_mask_channel_overlap: bool = False
        # min space between spans (if no overlap is enabled)
        self.mask_channel_min_space: int = 1

        # --- positional embeddings ---
        # number of filters for convolutional positional embeddings
        self.conv_pos: int = 128
        # number of groups for convolutional positional embedding
        self.conv_pos_groups: int = 16

        # --- relative position embedding ---
        # apply relative position embedding
        self.relative_position_embedding: bool = False
        # number of buckets for relative position embedding
        self.num_buckets: int = 320
        # maximum distance for relative position embedding
        self.max_distance: int = 1280
        # apply gated relative position embedding
        self.gru_rel_pos: bool = False

        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        """Overwrite or add attributes from the key/value pairs in ``cfg``."""
        vars(self).update(cfg)
|
1069 |
+
|
1070 |
+
|
1071 |
+
class WavLM(nn.Module):
    """Convolutional feature extractor followed by a Transformer encoder.

    The constructor wires up the sub-modules from a ``WavLMConfig``;
    ``extract_features`` runs the full pipeline on a batch of inputs.
    """

    def __init__(
        self,
        cfg: WavLMConfig,
    ) -> None:
        super().__init__()
        logger.info(f"WavLM Config: {cfg.__dict__}")

        self.cfg = cfg

        # SECURITY NOTE: the layer spec is a Python expression evaluated with
        # eval(); the default value relies on list `+` / `*`, so it cannot be
        # parsed by ast.literal_eval. Only load configs from trusted sources.
        feature_enc_layers = eval(cfg.conv_feature_layers)
        # channel count produced by the last conv layer
        self.embed = feature_enc_layers[-1][0]

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers,
            dropout=0.0,
            mode=cfg.extractor_mode,
            conv_bias=cfg.conv_bias,
        )

        # project conv features to the encoder width only when they differ
        if self.embed != cfg.encoder_embed_dim:
            self.post_extract_proj = nn.Linear(self.embed, cfg.encoder_embed_dim)
        else:
            self.post_extract_proj = None

        # time-step masking settings
        self.mask_prob = cfg.mask_prob
        self.mask_selection = cfg.mask_selection
        self.mask_other = cfg.mask_other
        self.mask_length = cfg.mask_length
        self.no_mask_overlap = cfg.no_mask_overlap
        self.mask_min_space = cfg.mask_min_space

        # channel masking settings
        self.mask_channel_prob = cfg.mask_channel_prob
        self.mask_channel_selection = cfg.mask_channel_selection
        self.mask_channel_other = cfg.mask_channel_other
        self.mask_channel_length = cfg.mask_channel_length
        self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
        self.mask_channel_min_space = cfg.mask_channel_min_space

        self.dropout_input = nn.Dropout(cfg.dropout_input)
        self.dropout_features = nn.Dropout(cfg.dropout_features)

        self.feature_grad_mult = cfg.feature_grad_mult

        # learned vector written into masked time steps
        self.mask_emb = nn.Parameter(
            torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
        )

        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

    def apply_mask(self, x, padding_mask):
        """Mask time steps and/or channels of ``x`` in place.

        Masked time steps are overwritten with ``self.mask_emb``; masked
        channels are set to 0 for every time step.

        Returns:
            ``(x, mask_indices)`` where ``mask_indices`` is the boolean
            time-step mask tensor, or None when ``self.mask_prob`` is not
            positive.
        """
        B, T, C = x.shape

        mask_indices = None
        if self.mask_prob > 0:
            time_mask = compute_mask_indices(
                (B, T),
                padding_mask,
                self.mask_prob,
                self.mask_length,
                self.mask_selection,
                self.mask_other,
                min_masks=2,
                no_overlap=self.no_mask_overlap,
                min_space=self.mask_min_space,
            )
            mask_indices = torch.from_numpy(time_mask).to(x.device)
            x[mask_indices] = self.mask_emb

        if self.mask_channel_prob > 0:
            channel_mask = compute_mask_indices(
                (B, C),
                None,
                self.mask_channel_prob,
                self.mask_channel_length,
                self.mask_channel_selection,
                self.mask_channel_other,
                no_overlap=self.no_mask_channel_overlap,
                min_space=self.mask_channel_min_space,
            )
            # broadcast the per-example channel mask over all T time steps
            channel_mask = torch.from_numpy(channel_mask).to(x.device)
            mask_channel_indices = channel_mask.unsqueeze(1).expand(-1, T, -1)
            x[mask_channel_indices] = 0

        return x, mask_indices

    def forward_padding_mask(
        self,
        features: torch.Tensor,
        padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Reduce ``padding_mask`` to one entry per step of ``features``.

        Trailing mask entries that do not divide evenly into
        ``features.size(1)`` groups are dropped; a step counts as padded
        only if every entry in its group is set.
        """
        num_steps = features.size(1)
        extra = padding_mask.size(1) % num_steps
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        grouped = padding_mask.view(padding_mask.size(0), num_steps, -1)
        return grouped.all(-1)

    def extract_features(
        self,
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        mask: bool = False,
        ret_conv: bool = False,
        output_layer: Optional[int] = None,
        ret_layer_results: bool = False,
    ):
        """Run the conv front-end and the Transformer encoder on ``source``.

        Args:
            source: input batch passed to the feature extractor.
            padding_mask: optional padding mask; when given it is reduced to
                one entry per extracted step.
            mask: when True, ``apply_mask`` is applied before the encoder.
            ret_conv: when True, return the pre-encoder features instead of
                the encoder output.
            output_layer: 1-based encoder layer to stop at; None runs all
                layers.
            ret_layer_results: when True, the first returned element is a
                ``(feature, layer_results)`` tuple.

        Returns:
            ``(feature, padding_mask)`` with the reduced padding mask.
        """
        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            # a non-positive multiplier means the front-end gets no gradients
            with torch.no_grad():
                features = self.feature_extractor(source)

        features = self.layer_norm(features.transpose(1, 2))

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)

        if mask:
            x, _ = self.apply_mask(features, padding_mask)
        else:
            x = features

        # feature: (B, T, D), float
        # target: (B, T), long
        # x: (B, T, D), float
        # padding_mask: (B, T), bool
        # mask_indices: (B, T), bool
        encoder_layer = None if output_layer is None else output_layer - 1
        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
            layer=encoder_layer,
        )

        feature = features if ret_conv else x
        if ret_layer_results:
            feature = (feature, layer_results)
        return feature, padding_mask
|
1229 |
+
|
1230 |
+
|
1231 |
+
class ConvFeatureExtractionModel(nn.Module):
    """Stack of convolutional layers built from ``(dim, kernel, stride)`` specs.

    ``conv_type`` selects the layout: "default" builds Conv1d blocks,
    "conv2d" builds Conv2d + ReLU pairs, and "custom" builds
    Conv2d + LayerNorm + ReLU with a max-pool after every second spec.
    Any other value builds no layers.
    """

    def __init__(
        self,
        conv_layers: List[Tuple[int, int, int]],
        dropout: float = 0.0,
        mode: str = "default",
        conv_bias: bool = False,
        conv_type: str = "default",
    ):
        super().__init__()

        assert mode in {"default", "layer_norm"}

        def block(
            n_in,
            n_out,
            k,
            stride,
            is_layer_norm=False,
            is_group_norm=False,
            conv_bias=False,
        ):
            """Build one Conv1d block with an optional norm and a GELU."""
            assert not (
                is_layer_norm and is_group_norm
            ), "layer norm and group norm are exclusive"

            conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
            nn.init.kaiming_normal_(conv.weight)

            stages = [conv, nn.Dropout(p=dropout)]
            if is_layer_norm:
                # layer norm acts on the last axis, so swap channels there
                # and back around it
                stages.append(
                    nn.Sequential(
                        TransposeLast(),
                        Fp32LayerNorm(n_out, elementwise_affine=True),
                        TransposeLast(),
                    )
                )
            elif is_group_norm:
                stages.append(Fp32GroupNorm(n_out, n_out, affine=True))
            stages.append(nn.GELU())
            return nn.Sequential(*stages)

        self.conv_type = conv_type
        if self.conv_type == "default":
            in_d = 1
            self.conv_layers = nn.ModuleList()
            use_layer_norm = mode == "layer_norm"
            for i, cl in enumerate(conv_layers):
                assert len(cl) == 3, "invalid conv definition: " + str(cl)
                dim, k, stride = cl

                # in "default" mode only the first block gets a group norm
                self.conv_layers.append(
                    block(
                        in_d,
                        dim,
                        k,
                        stride,
                        is_layer_norm=use_layer_norm,
                        is_group_norm=mode == "default" and i == 0,
                        conv_bias=conv_bias,
                    )
                )
                in_d = dim
        elif self.conv_type == "conv2d":
            in_d = 1
            self.conv_layers = nn.ModuleList()
            for cl in conv_layers:
                assert len(cl) == 3
                dim, k, stride = cl

                self.conv_layers.append(torch.nn.Conv2d(in_d, dim, k, stride))
                self.conv_layers.append(torch.nn.ReLU())
                in_d = dim
        elif self.conv_type == "custom":
            in_d = 1
            # size of the last axis that the LayerNorm is built for; it is
            # halved (rounded up) after every pooling layer
            idim = 80
            self.conv_layers = nn.ModuleList()
            for i, cl in enumerate(conv_layers):
                assert len(cl) == 3
                dim, k, stride = cl
                self.conv_layers.append(
                    torch.nn.Conv2d(in_d, dim, k, stride, padding=1)
                )
                self.conv_layers.append(torch.nn.LayerNorm([dim, idim]))
                self.conv_layers.append(torch.nn.ReLU())
                in_d = dim
                if (i + 1) % 2 == 0:
                    self.conv_layers.append(
                        torch.nn.MaxPool2d(2, stride=2, ceil_mode=True)
                    )
                    idim = int(math.ceil(idim / 2))
        # any other conv_type: no layers are created

    def forward(self, x, mask=None):
        """Apply the conv stack to ``x``; ``mask`` is accepted but unused."""
        # BxT -> BxCxT
        x = x.unsqueeze(1)

        if self.conv_type == "custom":
            for layer in self.conv_layers:
                if isinstance(layer, nn.LayerNorm):
                    # normalise with axes 1 and 2 swapped, then swap back
                    x = layer(x.transpose(1, 2)).transpose(1, 2)
                else:
                    x = layer(x)
            x = x.transpose(2, 3).contiguous()
            # fold axes 1 and 2 together, keeping the last axis
            return x.view(x.size(0), -1, x.size(-1))

        for layer in self.conv_layers:
            x = layer(x)
        if self.conv_type == "conv2d":
            b, c, t, f = x.size()
            x = x.transpose(2, 3).contiguous().view(b, c * f, t)
        return x
|
1353 |
+
|
1354 |
+
|
1355 |
+
class TransformerEncoder(nn.Module):
    """Convolutional positional embedding plus a stack of Transformer layers."""

    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        # grouped 1-D convolution that produces the positional embedding
        self.pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        init_dropout = 0
        init_std = math.sqrt(
            (4 * (1.0 - init_dropout)) / (args.conv_pos * self.embedding_dim)
        )
        nn.init.normal_(self.pos_conv.weight, mean=0, std=init_std)
        nn.init.constant_(self.pos_conv.bias, 0)

        self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

        # relative position settings are optional on `args`
        has_rel_pos_cfg = hasattr(args, "relative_position_embedding")
        self.relative_position_embedding = (
            args.relative_position_embedding if has_rel_pos_cfg else False
        )
        self.num_buckets = args.num_buckets if has_rel_pos_cfg else 0
        self.max_distance = args.max_distance if has_rel_pos_cfg else 0

        encoder_layers = []
        for layer_idx in range(args.encoder_layers):
            # only the first layer owns the relative attention bias
            owns_bias = self.relative_position_embedding and layer_idx == 0
            encoder_layers.append(
                TransformerSentenceEncoderLayer(
                    embedding_dim=self.embedding_dim,
                    ffn_embedding_dim=args.encoder_ffn_embed_dim,
                    num_attention_heads=args.encoder_attention_heads,
                    dropout=self.dropout,
                    attention_dropout=args.attention_dropout,
                    activation_dropout=args.activation_dropout,
                    activation_fn=args.activation_fn,
                    layer_norm_first=args.layer_norm_first,
                    has_relative_attention_bias=owns_bias,
                    num_buckets=self.num_buckets,
                    max_distance=self.max_distance,
                    gru_rel_pos=args.gru_rel_pos,
                )
            )
        self.layers = nn.ModuleList(encoder_layers)

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        self.apply(init_bert_params)

    def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
        """Encode ``x`` and return ``(output, layer_results)``.

        In pre-norm mode the final layer norm is applied only when no
        specific ``layer`` was requested.
        """
        x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)

        if self.layer_norm_first and layer is None:
            x = self.layer_norm(x)

        return x, layer_results

    def extract_features(
        self, x, padding_mask=None, streaming_mask=None, tgt_layer=None
    ):
        """Add positional information to ``x`` and run the layer stack.

        Padded positions of ``x`` are zeroed in place first. When
        ``tgt_layer`` is given, per-layer ``(x, z)`` pairs are collected
        (starting with the encoder input) and the stack stops after the
        layer with that 0-based index.

        Returns:
            ``(x, layer_results)`` with ``x`` back in B x T x C layout.
        """
        if padding_mask is not None:
            x[padding_mask] = 0

        positional = self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
        x = x + positional

        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        collect = tgt_layer is not None
        layer_results = []
        z = None
        if collect:
            layer_results.append((x, z))

        selected = None
        pos_bias = None
        for idx, enc_layer in enumerate(self.layers):
            # drawn on every iteration; a layer is skipped only in training
            # mode when the draw does not exceed the layerdrop rate
            draw = np.random.random()
            run_layer = (not self.training) or (draw > self.layerdrop)
            if run_layer:
                x, z, pos_bias = enc_layer(
                    x,
                    self_attn_padding_mask=padding_mask,
                    need_weights=False,
                    self_attn_mask=streaming_mask,
                    pos_bias=pos_bias,
                )
            if collect:
                layer_results.append((x, z))
            if idx == tgt_layer:
                selected = x
                break

        if selected is not None:
            x = selected

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, layer_results
|
1469 |
+
|
1470 |
+
|
1471 |
+
class TransformerSentenceEncoderLayer(nn.Module):
    """
    A single Transformer encoder layer (self-attention followed by a
    position-wise feed-forward network), as used in BERT/XLM style
    pre-trained models.
    """

    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        layer_norm_first: bool = False,
        has_relative_attention_bias: bool = False,
        num_buckets: int = 0,
        max_distance: int = 0,
        rescale_init: bool = False,
        gru_rel_pos: bool = False,
    ) -> None:
        super().__init__()

        # plain hyper-parameters
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # activation and self-attention
        self.activation_name = activation_fn
        self.activation_fn = get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            has_relative_attention_bias=has_relative_attention_bias,
            num_buckets=num_buckets,
            max_distance=max_distance,
            rescale_init=rescale_init,
            gru_rel_pos=gru_rel_pos,
        )

        # dropout1: after attention, dropout2: after the first FFN
        # projection, dropout3: after the second FFN projection
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(self.activation_dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.layer_norm_first = layer_norm_first

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)

        # with "glu" the first projection is a GLU_Linear and no separate
        # activation is applied afterwards
        if self.activation_name == "glu":
            self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
        else:
            self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim)

    def _feed_forward(self, x):
        """Apply fc1 (+ activation unless "glu"), dropout, fc2 and dropout."""
        hidden = self.fc1(x)
        if self.activation_name != "glu":
            hidden = self.activation_fn(hidden)
        hidden = self.dropout2(hidden)
        hidden = self.fc2(hidden)
        return self.dropout3(hidden)

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: torch.Tensor = None,
        self_attn_padding_mask: torch.Tensor = None,
        need_weights: bool = False,
        pos_bias=None,
    ):
        """
        Run self-attention and the feed-forward network with residual
        connections. LayerNorm is applied either before each sub-module
        (``layer_norm_first``) or after each residual sum.

        Returns:
            ``(x, attn, pos_bias)`` as produced by the attention module and
            the feed-forward path.
        """
        residual = x

        if self.layer_norm_first:
            # pre-norm: normalise, attend, add residual; then the same for
            # the FFN. Attention weights are never requested on this path.
            x = self.self_attn_layer_norm(x)
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=False,
                attn_mask=self_attn_mask,
                position_bias=pos_bias,
            )
            x = residual + self.dropout1(x)

            residual = x
            x = residual + self._feed_forward(self.final_layer_norm(x))
        else:
            # post-norm: attend, add residual, normalise; then the same for
            # the FFN
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias,
            )
            x = self.self_attn_layer_norm(residual + self.dropout1(x))

            residual = x
            x = self.final_layer_norm(residual + self._feed_forward(x))

        return x, attn, pos_bias
|