LAP-DEV committed · Commit 710db5f · verified · 1 parent: b179c36

Upload 63 files

This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .dockerignore +10 -0
  2. .github/FUNDING.yml +13 -0
  3. .github/ISSUE_TEMPLATE/bug_report.md +11 -0
  4. .github/ISSUE_TEMPLATE/feature_request.md +10 -0
  5. .github/ISSUE_TEMPLATE/hallucination.md +12 -0
  6. .github/pull_request_template.md +5 -0
  7. .github/workflows/ci-shell.yml +43 -0
  8. .github/workflows/ci.yml +41 -0
  9. .github/workflows/publish-docker.yml +37 -0
  10. .gitignore +13 -0
  11. Dockerfile +34 -0
  12. Install.bat +20 -0
  13. Install.sh +17 -0
  14. LICENSE +201 -0
  15. README.md +117 -12
  16. app.py +359 -0
  17. configs/default_parameters.yaml +64 -0
  18. demo/audio.wav +0 -0
  19. docker-compose.yaml +29 -0
  20. models/models will be saved here.txt +0 -0
  21. modules/__init__.py +0 -0
  22. modules/diarize/__init__.py +0 -0
  23. modules/diarize/audio_loader.py +179 -0
  24. modules/diarize/diarize_pipeline.py +95 -0
  25. modules/diarize/diarizer.py +133 -0
  26. modules/translation/__init__.py +0 -0
  27. modules/translation/deepl_api.py +226 -0
  28. modules/translation/nllb_inference.py +287 -0
  29. modules/translation/translation_base.py +177 -0
  30. modules/ui/__init__.py +0 -0
  31. modules/ui/htmls.py +97 -0
  32. modules/utils/__init__.py +0 -0
  33. modules/utils/cli_manager.py +12 -0
  34. modules/utils/files_manager.py +69 -0
  35. modules/utils/paths.py +31 -0
  36. modules/utils/subtitle_manager.py +132 -0
  37. modules/utils/youtube_manager.py +33 -0
  38. modules/uvr/music_separator.py +183 -0
  39. modules/vad/__init__.py +0 -0
  40. modules/vad/silero_vad.py +264 -0
  41. modules/whisper/__init__.py +0 -0
  42. modules/whisper/faster_whisper_inference.py +192 -0
  43. modules/whisper/insanely_fast_whisper_inference.py +195 -0
  44. modules/whisper/whisper_Inference.py +104 -0
  45. modules/whisper/whisper_base.py +542 -0
  46. modules/whisper/whisper_factory.py +90 -0
  47. modules/whisper/whisper_parameter.py +369 -0
  48. notebook/whisper-webui.ipynb +132 -0
  49. outputs/outputs are saved here.txt +0 -0
  50. outputs/translations/outputs for translation are saved here.txt +0 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
1
+ # from .gitignore
2
+ venv/
3
+ ui/__pycache__/
4
+ outputs/
5
+ modules/__pycache__/
6
+ models/
7
+ modules/yt_tmp.wav
8
+
9
+ .git
10
+ .github
.github/FUNDING.yml ADDED
@@ -0,0 +1,13 @@
1
+ # These are supported funding model platforms
2
+
3
+ github: []
4
+ patreon: # Replace with a single Patreon username
5
+ open_collective: # Replace with a single Open Collective username
6
+ ko_fi: jhj0517
7
+ tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8
+ community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9
+ liberapay: # Replace with a single Liberapay username
10
+ issuehunt: # Replace with a single IssueHunt username
11
+ otechie: # Replace with a single Otechie username
12
+ lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13
+ custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,11 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: bug
6
+ assignees: jhj0517
7
+
8
+ ---
9
+
10
+ **Which OS are you using?**
11
+ - OS: [e.g. iOS or Windows.. If you are using Google Colab, just Colab.]
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,10 @@
1
+ ---
2
+ name: Feature request
3
+ about: Any feature you want
4
+ title: ''
5
+ labels: enhancement
6
+ assignees: jhj0517
7
+
8
+ ---
9
+
10
+
.github/ISSUE_TEMPLATE/hallucination.md ADDED
@@ -0,0 +1,12 @@
1
+ ---
2
+ name: Hallucination
3
+ about: Whisper hallucinations. ( Repeating certain words or subtitles starting too
4
+ early, etc. )
5
+ title: ''
6
+ labels: hallucination
7
+ assignees: jhj0517
8
+
9
+ ---
10
+
11
+ **Download URL for sample audio**
12
+ - Please upload download URL for sample audio file so I can test with some settings for better result. You can use https://easyupload.io/ or any other service to share.
.github/pull_request_template.md ADDED
@@ -0,0 +1,5 @@
1
+ ## Related issues
2
+ - #0
3
+
4
+ ## Changed
5
+ 1. Changes
.github/workflows/ci-shell.yml ADDED
@@ -0,0 +1,43 @@
1
+ name: CI-Shell Script
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ push:
7
+ branches:
8
+ - master
9
+ pull_request:
10
+ branches:
11
+ - master
12
+
13
+ jobs:
14
+ test-shell-script:
15
+
16
+ runs-on: ubuntu-latest
17
+ strategy:
18
+ matrix:
19
+ python: [ "3.10" ]
20
+
21
+ steps:
22
+ - name: Clean up space for action
23
+ run: rm -rf /opt/hostedtoolcache
24
+
25
+ - uses: actions/checkout@v4
26
+ - name: Setup Python
27
+ uses: actions/setup-python@v5
28
+ with:
29
+ python-version: ${{ matrix.python }}
30
+
31
+ - name: Install git and ffmpeg
32
+ run: sudo apt-get update && sudo apt-get install -y git ffmpeg
33
+
34
+ - name: Execute Install.sh
35
+ run: |
36
+ chmod +x ./Install.sh
37
+ ./Install.sh
38
+
39
+ - name: Execute start-webui.sh
40
+ run: |
41
+ chmod +x ./start-webui.sh
42
+ timeout 60s ./start-webui.sh || true
43
+
.github/workflows/ci.yml ADDED
@@ -0,0 +1,41 @@
1
+ name: CI
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ push:
7
+ branches:
8
+ - master
9
+ pull_request:
10
+ branches:
11
+ - master
12
+
13
+ jobs:
14
+ build:
15
+
16
+ runs-on: ubuntu-latest
17
+ strategy:
18
+ matrix:
19
+ python: ["3.10"]
20
+
21
+ env:
22
+ DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
23
+
24
+ steps:
25
+ - name: Clean up space for action
26
+ run: rm -rf /opt/hostedtoolcache
27
+
28
+ - uses: actions/checkout@v4
29
+ - name: Setup Python
30
+ uses: actions/setup-python@v5
31
+ with:
32
+ python-version: ${{ matrix.python }}
33
+
34
+ - name: Install git and ffmpeg
35
+ run: sudo apt-get update && sudo apt-get install -y git ffmpeg
36
+
37
+ - name: Install dependencies
38
+ run: pip install -r requirements.txt pytest
39
+
40
+ - name: Run test
41
+ run: python -m pytest -rs tests
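For a local run of the same checks, the workflow's two test steps boil down to the commands below (a sketch: `requirements.txt` and the `tests` directory belong to the repo but fall outside this 50-file view):

```sh
pip install -r requirements.txt pytest
python -m pytest -rs tests
```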
.github/workflows/publish-docker.yml ADDED
@@ -0,0 +1,37 @@
1
+ name: Publish to Docker Hub
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+
8
+ jobs:
9
+ build-and-push:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Log in to Docker Hub
14
+ uses: docker/login-action@v2
15
+ with:
16
+ username: ${{ secrets.DOCKER_USERNAME }}
17
+ password: ${{ secrets.DOCKER_PASSWORD }}
18
+
19
+ - name: Checkout repository
20
+ uses: actions/checkout@v3
21
+
22
+ - name: Set up Docker Buildx
23
+ uses: docker/setup-buildx-action@v3
24
+
25
+ - name: Set up QEMU
26
+ uses: docker/setup-qemu-action@v3
27
+
28
+ - name: Build and push Docker image
29
+ uses: docker/build-push-action@v5
30
+ with:
31
+ context: .
32
+ file: ./Dockerfile
33
+ push: true
34
+ tags: ${{ secrets.DOCKER_USERNAME }}/whisper-webui:latest
35
+
36
+ - name: Log out of Docker Hub
37
+ run: docker logout
.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ *.wav
2
+ *.png
3
+ *.mp4
4
+ *.mp3
5
+ .idea/
6
+ .pytest_cache/
7
+ venv/
8
+ modules/ui/__pycache__/
9
+ outputs/
10
+ modules/__pycache__/
11
+ models/
12
+ modules/yt_tmp.wav
13
+ configs/default_parameters.yaml
Dockerfile ADDED
@@ -0,0 +1,34 @@
1
+ FROM debian:bookworm-slim AS builder
2
+
3
+ RUN apt-get update && \
4
+ apt-get install -y curl git python3 python3-pip python3-venv && \
5
+ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
6
+ mkdir -p /Whisper-WebUI
7
+
8
+ WORKDIR /Whisper-WebUI
9
+
10
+ COPY requirements.txt .
11
+
12
+ RUN python3 -m venv venv && \
13
+ . venv/bin/activate && \
14
+ pip install --no-cache-dir -r requirements.txt
15
+
16
+
17
+ FROM debian:bookworm-slim AS runtime
18
+
19
+ RUN apt-get update && \
20
+ apt-get install -y curl ffmpeg python3 && \
21
+ rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
22
+
23
+ WORKDIR /Whisper-WebUI
24
+
25
+ COPY . .
26
+ COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
27
+
28
+ VOLUME [ "/Whisper-WebUI/models" ]
29
+ VOLUME [ "/Whisper-WebUI/outputs" ]
30
+
31
+ ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
32
+ ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
33
+
34
+ ENTRYPOINT [ "python", "app.py" ]
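Since the image declares `models`/`outputs` volumes and `python app.py` as its entrypoint, a plain `docker run` sketch (without compose) could look like the following; the image tag, host paths, and the `--gpus all` flag are illustrative assumptions, not part of this commit:

```sh
# Build the image, then run it with the Gradio port published and the declared volumes mounted.
# Arguments after the image name are appended to the "python app.py" entrypoint.
docker build -t whisper-webui:latest .
docker run --rm --gpus all -p 7860:7860 \
  -v "$PWD/models":/Whisper-WebUI/models \
  -v "$PWD/outputs":/Whisper-WebUI/outputs \
  whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
```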
Install.bat ADDED
@@ -0,0 +1,20 @@
1
+ @echo off
2
+
3
+ if not exist "%~dp0\venv\Scripts" (
4
+ echo Creating venv...
5
+ python -m venv venv
6
+ )
7
+ echo checked the venv folder. now installing requirements..
8
+
9
+ call "%~dp0\venv\scripts\activate"
10
+
11
+ pip install -r requirements.txt
12
+
13
+ if errorlevel 1 (
14
+ echo.
15
+ echo Requirements installation failed. Please remove the venv folder and run Install.bat again.
16
+ ) else (
17
+ echo.
18
+ echo Requirements installed successfully.
19
+ )
20
+ pause
Install.sh ADDED
@@ -0,0 +1,17 @@
1
+ #!/bin/bash
2
+
3
+ if [ ! -d "venv" ]; then
4
+ echo "Creating virtual environment..."
5
+ python -m venv venv
6
+ fi
7
+
8
+ source venv/bin/activate
9
+
10
+ pip install -r requirements.txt && echo "Requirements installed successfully." || {
11
+ echo ""
12
+ echo "Requirements installation failed. Please remove the venv folder and run the script again."
13
+ deactivate
14
+ exit 1
15
+ }
16
+
17
+ deactivate
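The CI-Shell Script workflow above exercises this script with the commands below; the same two lines work locally (`start-webui.sh` is referenced by the README but falls outside this 50-file view):

```sh
chmod +x ./Install.sh && ./Install.sh
chmod +x ./start-webui.sh && ./start-webui.sh
```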
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 jhj0517
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,117 @@
1
- ---
2
- title: Whisper WebUI
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.5.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Whisper-WebUI
2
+ A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an Easy Subtitle Generator!
3
+
4
+ ![Whisper WebUI](https://github.com/jhj0517/Whsiper-WebUI/blob/master/screenshot.png)
5
+
6
+ ## Notebook
7
+ If you wish to try this on Colab, you can do it in [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
8
+
9
+ # Feature
10
+ - Select the Whisper implementation you want to use from :
11
+ - [openai/whisper](https://github.com/openai/whisper)
12
+ - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
13
+ - [Vaibhavs10/insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
14
+ - Generate subtitles from various sources, including :
15
+ - Files
16
+ - Youtube
17
+ - Microphone
18
+ - Currently supported subtitle formats :
19
+ - SRT
20
+ - WebVTT
21
+ - txt ( plain text only, without timestamps )
22
+ - Speech to Text Translation
23
+ - From other languages to English. ( This is Whisper's end-to-end speech-to-text translation feature )
24
+ - Text to Text Translation
25
+ - Translate subtitle files using Facebook NLLB models
26
+ - Translate subtitle files using DeepL API
27
+ - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
28
+ - Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
29
+ - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
30
+ - To download the pyannote model, you need a Hugging Face token and must manually accept their terms on the pages below.
31
+ 1. https://huggingface.co/pyannote/speaker-diarization-3.1
32
+ 2. https://huggingface.co/pyannote/segmentation-3.0
33
+
34
+ # Installation and Running
35
+ ### Prerequisite
36
+ To run this WebUI, you need `git`, `python` version 3.8 ~ 3.10, and `FFmpeg`. <br>
37
+ If you're not using an Nvidia GPU, or are using a `CUDA` version other than 12.4, edit the [`requirements.txt`](https://github.com/jhj0517/Whisper-WebUI/blob/master/requirements.txt) to match your environment.
38
+
39
+ Please follow the links below to install the necessary software:
40
+ - git : [https://git-scm.com/downloads](https://git-scm.com/downloads)
41
+ - python : [https://www.python.org/downloads/](https://www.python.org/downloads/) **(If your Python version is too new, torch will not install properly.)**
42
+ - FFmpeg : [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
43
+ - CUDA : [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
44
+
45
+ After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
46
+
47
+ ### Automatic Installation
48
+
49
+ 1. Download `Whisper-WebUI.zip` with the file corresponding to your OS from [v1.0.0](https://github.com/jhj0517/Whisper-WebUI/releases/tag/v1.0.0) and extract its contents.
50
+ 2. Run `install.bat` or `install.sh` to install dependencies. (This will create a `venv` directory and install dependencies there.)
51
+ 3. Start WebUI with `start-webui.bat` or `start-webui.sh`
52
+ 4. To update the WebUI, run `update.bat` or `update.sh`
53
+
54
+ You can also run the project with command-line arguments; see the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide to the arguments. A sample invocation is sketched below.
55
+
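For reference, a minimal invocation sketch using a few of the flags defined in this commit's `app.py` (the values are illustrative, not defaults):

```sh
python app.py --whisper_type faster-whisper --server_name 0.0.0.0 --server_port 7860 --share true
```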
56
+ ## Running with Docker
57
+
58
+ 1. Install and launch [Docker-Desktop](https://www.docker.com/products/docker-desktop/).
59
+
60
+ 2. Git clone the repository
61
+
62
+ ```sh
63
+ git clone https://github.com/jhj0517/Whisper-WebUI.git
64
+ ```
65
+
66
+ 3. Build the image ( the image is about 7 GB )
67
+
68
+ ```sh
69
+ docker compose build
70
+ ```
71
+
72
+ 4. Run the container
73
+
74
+ ```sh
75
+ docker compose up
76
+ ```
77
+
78
+ 5. Connect to the WebUI with your browser at `http://localhost:7860`
79
+
80
+ If needed, update the [`docker-compose.yaml`](https://github.com/jhj0517/Whisper-WebUI/blob/master/docker-compose.yaml) to match your environment.
81
+
82
+ # VRAM Usages
83
+ This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
84
+
85
+ According to faster-whisper, the efficiency of the optimized whisper model is as follows:
86
+ | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
87
+ |-------------------|-----------|-----------|-------|-----------------|-----------------|
88
+ | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
89
+ | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
90
+
91
+ If you want to use an implementation other than faster-whisper, pass the `--whisper_type` argument with the repository name.<br>
92
+ Read [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about CLI args.
93
+
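As a sketch, the values accepted by this commit's `app.py` are `whisper`, `faster-whisper`, and `insanely-fast-whisper`, e.g.:

```sh
python app.py --whisper_type insanely-fast-whisper
```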
94
+ ## Available models
95
+ This is Whisper's original VRAM usage table for models.
96
+
97
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
98
+ |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
99
+ | tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
100
+ | base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
101
+ | small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
102
+ | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
103
+ | large | 1550 M | N/A | `large` | ~10 GB | 1x |
104
+
105
+
106
+ `.en` models are for English only, and the `Translate to English` option is available for the `large` models!
107
+
108
+ ## TODO🗓
109
+
110
+ - [x] Add DeepL API translation
111
+ - [x] Add NLLB Model translation
112
+ - [x] Integrate with faster-whisper
113
+ - [x] Integrate with insanely-fast-whisper
114
+ - [x] Integrate with whisperX ( Only speaker diarization part )
115
+ - [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
116
+ - [ ] Add FastAPI script
117
+ - [ ] Support real-time transcription for microphone
app.py ADDED
@@ -0,0 +1,359 @@
1
+ import os
2
+ import argparse
3
+ import gradio as gr
4
+ import yaml
5
+
6
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
7
+ INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
8
+ UVR_MODELS_DIR)
9
+ from modules.utils.files_manager import load_yaml
10
+ from modules.whisper.whisper_factory import WhisperFactory
11
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
12
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
13
+ from modules.translation.nllb_inference import NLLBInference
14
+ from modules.ui.htmls import *
15
+ from modules.utils.cli_manager import str2bool
16
+ from modules.utils.youtube_manager import get_ytmetas
17
+ from modules.translation.deepl_api import DeepLAPI
18
+ from modules.whisper.whisper_parameter import *
19
+
20
+ ### Device info ###
21
+ import torch
22
+ import torchaudio
23
+ import torch.cuda as cuda
24
+ import platform
25
+ from transformers import __version__ as transformers_version
26
+
27
+ device = "cuda" if torch.cuda.is_available() else "cpu"
28
+ num_gpus = cuda.device_count() if torch.cuda.is_available() else 0
29
+ cuda_version = torch.version.cuda if torch.cuda.is_available() else "N/A"
30
+ cudnn_version = torch.backends.cudnn.version() if torch.cuda.is_available() else "N/A"
31
+ os_info = platform.system() + " " + platform.release() + " " + platform.machine()
32
+
33
+ # Get the available VRAM for each GPU (if available)
34
+ vram_info = []
35
+ if torch.cuda.is_available():
36
+ for i in range(cuda.device_count()):
37
+ gpu_properties = cuda.get_device_properties(i)
38
+ vram_info.append(f"**GPU {i}: {gpu_properties.total_memory / 1024**3:.2f} GB**")
39
+
40
+ pytorch_version = torch.__version__
41
+ torchaudio_version = torchaudio.__version__ if 'torchaudio' in dir() else "N/A"
42
+
43
+ device_info = f"""Running on: **{device}**
44
+
45
+ Number of GPUs available: **{num_gpus}**
46
+
47
+ CUDA version: **{cuda_version}**
48
+
49
+ CuDNN version: **{cudnn_version}**
50
+
51
+ PyTorch version: **{pytorch_version}**
52
+
53
+ Torchaudio version: **{torchaudio_version}**
54
+
55
+ Transformers version: **{transformers_version}**
56
+
57
+ Operating system: **{os_info}**
58
+
59
+ Available VRAM:
60
+ \t {', '.join(vram_info) if vram_info else '**N/A**'}
61
+ """
62
+ ### End Device info ###
63
+
64
+ class App:
65
+ def __init__(self, args):
66
+ self.args = args
67
+ #self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
68
+ self.app = gr.Blocks(css=CSS, theme=gr.themes.Ocean(), delete_cache=(60, 3600))
69
+ self.whisper_inf = WhisperFactory.create_whisper_inference(
70
+ whisper_type=self.args.whisper_type,
71
+ whisper_model_dir=self.args.whisper_model_dir,
72
+ faster_whisper_model_dir=self.args.faster_whisper_model_dir,
73
+ insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
74
+ uvr_model_dir=self.args.uvr_model_dir,
75
+ output_dir=self.args.output_dir,
76
+ )
77
+ self.nllb_inf = NLLBInference(
78
+ model_dir=self.args.nllb_model_dir,
79
+ output_dir=os.path.join(self.args.output_dir, "translations")
80
+ )
81
+ self.deepl_api = DeepLAPI(
82
+ output_dir=os.path.join(self.args.output_dir, "translations")
83
+ )
84
+ self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
85
+ print(f"Use \"{self.args.whisper_type}\" implementation")
86
+ print(f"Device \"{self.whisper_inf.device}\" is detected")
87
+
88
+ def create_whisper_parameters(self):
89
+
90
+ whisper_params = self.default_params["whisper"]
91
+ diarization_params = self.default_params["diarization"]
92
+ vad_params = self.default_params["vad"]
93
+ uvr_params = self.default_params["bgm_separation"]
94
+
95
+ with gr.Row():
96
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],label="Model")
97
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,value=whisper_params["lang"], label="Language")
98
+ #dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
99
+ dd_file_format = gr.Dropdown(choices=["SRT", "txt"], value="SRT", label="Output format")
100
+
101
+ with gr.Row():
102
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file",interactive=True)
103
+ cb_diarize = gr.Checkbox(label="Speaker diarization", value=diarization_params["is_diarize"])
104
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English",interactive=True)
105
+
106
+ with gr.Accordion("Diarization options", open=False):
107
+ tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
108
+ info="This is only needed the first time you download the model. If you already have"
109
+ " models, you don't need to enter. To download the model, you must manually go "
110
+ "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
111
+ " their requirement.")
112
+ dd_diarization_device = gr.Dropdown(label="Device",
113
+ choices=self.whisper_inf.diarizer.get_available_device(),
114
+ value=self.whisper_inf.diarizer.get_device())
115
+
116
+ with gr.Accordion("Advanced options", open=False):
117
+ nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
118
+ info="Beam size to use for decoding.")
119
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
120
+ info="If the average log probability over sampled tokens is below this value, treat as failed.")
121
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
122
+ info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
123
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
124
+ value=self.whisper_inf.current_compute_type, interactive=True,
125
+ allow_custom_value=True,
126
+ info="Select the type of computation to perform.")
127
+ nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
128
+ info="Number of candidates when sampling with non-zero temperature.")
129
+ nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
130
+ info="Beam search patience factor.")
131
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
132
+ interactive=True,
133
+ info="Condition on previous text during decoding.")
134
+ sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
135
+ minimum=0, maximum=1, step=0.01, interactive=True,
136
+ info="Resets prompt if temperature is above this value."
137
+ " Arg has effect only if 'Condition On Previous Text' is True.")
138
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
139
+ info="Initial prompt to use for decoding.")
140
+ sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
141
+ step=0.01, maximum=1.0, interactive=True,
142
+ info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
143
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
144
+ interactive=True,
145
+ info="If the gzip compression ratio is above this value, treat as failed.")
146
+ nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
147
+ precision=0,
148
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
149
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
150
+ nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
151
+ info="Exponential length penalty constant.")
152
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
153
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
154
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
155
+ precision=0,
156
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
157
+ tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
158
+ info="Optional text to provide as a prefix for the first window.")
159
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
160
+ info="Suppress blank outputs at the beginning of the sampling.")
161
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
162
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
163
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
164
+ info="The initial timestamp cannot be later than this.")
165
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
166
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
167
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
168
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
169
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
170
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
171
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
172
+ precision=0,
173
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
174
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
175
+ value=lambda: whisper_params["hallucination_silence_threshold"],
176
+ info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
177
+ tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
178
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
179
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
180
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
181
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
182
+ precision=0,
183
+ info="Number of segments to consider for the language detection.")
184
+ with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
185
+ nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
186
+
187
+ with gr.Accordion("Background Music Remover Filter", open=False):
188
+ cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
189
+ interactive=True,
190
+ info="Enabling this will remove background music by submodel before"
191
+ " transcribing ")
192
+ dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
193
+ choices=self.whisper_inf.music_separator.available_devices)
194
+ dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
195
+ choices=self.whisper_inf.music_separator.available_models)
196
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
197
+ cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
198
+ cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
199
+ value=uvr_params["enable_offload"])
200
+
201
+ with gr.Accordion("Voice Detection Filter", open=False):
202
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
203
+ interactive=True,
204
+ info="Enable this to transcribe only detected voice parts by submodel.")
205
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
206
+ value=vad_params["threshold"],
207
+ info="Lower it to be more sensitive to small sounds.")
208
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
209
+ value=vad_params["min_speech_duration_ms"],
210
+ info="Final speech chunks shorter than this time are thrown out")
211
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
212
+ value=vad_params["max_speech_duration_s"],
213
+ info="Maximum duration of speech chunks in \"seconds\".")
214
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
215
+ value=vad_params["min_silence_duration_ms"],
216
+ info="In the end of each speech chunk wait for this time"
217
+ " before separating it")
218
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
219
+ info="Final speech chunks are padded by this time each side")
220
+
221
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
222
+
223
+ return (
224
+ WhisperParameters(
225
+ model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
226
+ log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
227
+ compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
228
+ condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
229
+ temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
230
+ vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
231
+ max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
232
+ speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
233
+ is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
234
+ length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
235
+ no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
236
+ suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
237
+ word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
238
+ append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
239
+ hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
240
+ language_detection_threshold=nb_language_detection_threshold,
241
+ language_detection_segments=nb_language_detection_segments,
242
+ prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
243
+ uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
244
+ uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
245
+ ),
246
+ dd_file_format,
247
+ cb_timestamp
248
+ )
249
+
250
+ def launch(self):
251
+ translation_params = self.default_params["translation"]
252
+ deepl_params = translation_params["deepl"]
253
+ nllb_params = translation_params["nllb"]
254
+ uvr_params = self.default_params["bgm_separation"]
255
+
256
+ with self.app:
257
+ with gr.Row():
258
+ with gr.Column():
259
+ gr.Markdown(MARKDOWN, elem_id="md_project")
260
+ with gr.Tabs():
261
+ with gr.TabItem("Audio"): # tab1
262
+ with gr.Column():
263
+ #input_file = gr.Files(type="filepath", label="Upload File here")
264
+ input_file = gr.Audio(type='filepath', elem_id="audio_input")
265
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
266
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
267
+ " Leave this field empty if you do not wish to use a local path.",
268
+ visible=self.args.colab,
269
+ value="")
270
+
271
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
272
+
273
+ with gr.Row():
274
+ btn_run = gr.Button("Transcribe", variant="primary")
275
+ btn_reset = gr.Button(value="Reset")
276
+ btn_reset.click(None,js="window.location.reload()")
277
+ with gr.Row():
278
+ with gr.Column(scale=3):
279
+ tb_indicator = gr.Textbox(label="Output result")
280
+ with gr.Column(scale=1):
281
+ tb_info = gr.Textbox(label="Output info", interactive=False, scale=3)
282
+ files_subtitles = gr.Files(label="Output file", interactive=False, scale=2)
283
+ # btn_openfolder = gr.Button('📂', scale=1)
284
+
285
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
286
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
287
+ inputs=params + whisper_params.as_list(),
288
+ outputs=[tb_indicator, files_subtitles, tb_info])
289
+ # btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
290
+
291
+ with gr.TabItem("Device info"): # tab2
292
+ with gr.Column():
293
+ gr.Markdown(device_info, label="Hardware info & installed packages")
294
+
295
+ # Launch the app with optional gradio settings
296
+ args = self.args
297
+
298
+ self.app.queue(
299
+ api_open=args.api_open
300
+ ).launch(
301
+ share=args.share,
302
+ server_name=args.server_name,
303
+ server_port=args.server_port,
304
+ auth=(args.username, args.password) if args.username and args.password else None,
305
+ root_path=args.root_path,
306
+ inbrowser=args.inbrowser
307
+ )
308
+
309
+ @staticmethod
310
+ def open_folder(folder_path: str):
311
+ if os.path.exists(folder_path):
312
+ os.system(f"start {folder_path}")
313
+ else:
314
+ os.makedirs(folder_path, exist_ok=True)
315
+ print(f"The directory path {folder_path} has been newly created.")
316
+
317
+ @staticmethod
318
+ def on_change_models(model_size: str):
319
+ translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
320
+ if model_size not in translatable_model:
321
+ return gr.Checkbox(visible=False, value=False, interactive=False)
322
+ #return gr.Checkbox(visible=True, value=False, label="Translate to English (large models only)", interactive=False)
323
+ else:
324
+ return gr.Checkbox(visible=True, value=False, label="Translate to English", interactive=True)
325
+
326
+
327
+ # Create the parser for command-line arguments
328
+ parser = argparse.ArgumentParser()
329
+ parser.add_argument('--whisper_type', type=str, default="faster-whisper",
330
+ help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
331
+ parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
332
+ parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
333
+ parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
334
+ parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
335
+ parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
336
+ parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
337
+ parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
338
+ parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
339
+ parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
340
+ parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
341
+ parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
342
+ help='Directory path of the whisper model')
343
+ parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
344
+ help='Directory path of the faster-whisper model')
345
+ parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
346
+ default=INSANELY_FAST_WHISPER_MODELS_DIR,
347
+ help='Directory path of the insanely-fast-whisper model')
348
+ parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
349
+ help='Directory path of the diarization model')
350
+ parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
351
+ help='Directory path of the Facebook NLLB model')
352
+ parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
353
+ help='Directory path of the UVR model')
354
+ parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
355
+ _args = parser.parse_args()
356
+
357
+ if __name__ == "__main__":
358
+ app = App(args=_args)
359
+ app.launch()
configs/default_parameters.yaml ADDED
@@ -0,0 +1,64 @@
1
+ whisper:
2
+ model_size: "large-v3"
3
+ lang: "Automatic Detection"
4
+ is_translate: false
5
+ beam_size: 5
6
+ log_prob_threshold: -1
7
+ no_speech_threshold: 0.6
8
+ best_of: 5
9
+ patience: 1
10
+ condition_on_previous_text: true
11
+ prompt_reset_on_temperature: 0.5
12
+ initial_prompt: null
13
+ temperature: 0
14
+ compression_ratio_threshold: 2.4
15
+ chunk_length: 30
16
+ batch_size: 24
17
+ length_penalty: 1
18
+ repetition_penalty: 1
19
+ no_repeat_ngram_size: 0
20
+ prefix: null
21
+ suppress_blank: true
22
+ suppress_tokens: "[-1]"
23
+ max_initial_timestamp: 1
24
+ word_timestamps: false
25
+ prepend_punctuations: "\"'“¿([{-"
26
+ append_punctuations: "\"'.。,,!!??::”)]}、"
27
+ max_new_tokens: null
28
+ hallucination_silence_threshold: null
29
+ hotwords: null
30
+ language_detection_threshold: null
31
+ language_detection_segments: 1
32
+ add_timestamp: false
33
+
34
+ vad:
35
+ vad_filter: false
36
+ threshold: 0.5
37
+ min_speech_duration_ms: 250
38
+ max_speech_duration_s: 9999
39
+ min_silence_duration_ms: 1000
40
+ speech_pad_ms: 2000
41
+
42
+ diarization:
43
+ is_diarize: false
44
+ hf_token: ""
45
+
46
+ bgm_separation:
47
+ is_separate_bgm: false
48
+ model_size: "UVR-MDX-NET-Inst_HQ_4"
49
+ segment_size: 256
50
+ save_file: false
51
+ enable_offload: true
52
+
53
+ translation:
54
+ deepl:
55
+ api_key: ""
56
+ is_pro: false
57
+ source_lang: "Automatic Detection"
58
+ target_lang: "English"
59
+ nllb:
60
+ model_size: "facebook/nllb-200-1.3B"
61
+ source_lang: null
62
+ target_lang: null
63
+ max_length: 200
64
+ add_timestamp: true
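A minimal sketch of how `app.py` consumes these defaults; plain PyYAML stands in here for the repo's `load_yaml` helper, whose body is outside this 50-file view:

```python
import yaml

# Load the default parameters the WebUI starts with
with open("configs/default_parameters.yaml") as f:
    default_params = yaml.safe_load(f)

whisper_params = default_params["whisper"]
print(whisper_params["model_size"])        # "large-v3"
print(default_params["vad"]["threshold"])  # 0.5
```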
demo/audio.wav ADDED
Binary file (209 kB).
 
docker-compose.yaml ADDED
@@ -0,0 +1,29 @@
1
+ services:
2
+ app:
3
+ build: .
4
+ image: whisper-webui:latest
5
+
6
+ volumes:
7
+ # Update paths to mount models and output paths to your custom paths like this, e.g:
8
+ # - C:/whisper-models/custom-path:/Whisper-WebUI/models
9
+ # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
10
+ - /Whisper-WebUI/models
11
+ - /Whisper-WebUI/outputs
12
+
13
+ ports:
14
+ - "7860:7860"
15
+
16
+ stdin_open: true
17
+ tty: true
18
+
19
+ entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0",]
20
+
21
+ # If you're not using an Nvidia GPU, update the device to match yours.
22
+ # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
23
+ deploy:
24
+ resources:
25
+ reservations:
26
+ devices:
27
+ - driver: nvidia
28
+ count: all
29
+ capabilities: [ gpu ]
models/models will be saved here.txt ADDED
File without changes
modules/__init__.py ADDED
File without changes
modules/diarize/__init__.py ADDED
File without changes
modules/diarize/audio_loader.py ADDED
@@ -0,0 +1,179 @@
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
2
+
3
+ import os
4
+ import subprocess
5
+ from functools import lru_cache
6
+ from typing import Optional, Union
7
+ from scipy.io.wavfile import write
8
+ import tempfile
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
+ def exact_div(x, y):
15
+ assert x % y == 0
16
+ return x // y
17
+
18
+ # hard-coded audio hyperparameters
19
+ SAMPLE_RATE = 16000
20
+ N_FFT = 400
21
+ HOP_LENGTH = 160
22
+ CHUNK_LENGTH = 30
23
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
24
+ N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
25
+
26
+ N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2
27
+ FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
28
+ TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
29
+
30
+
31
+ def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
32
+ """
33
+ Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.
34
+
35
+ Parameters
36
+ ----------
37
+ file: Union[str, np.ndarray]
38
+ The audio file to open or a numpy array containing the audio data.
39
+
40
+ sr: int
41
+ The sample rate to resample the audio if necessary.
42
+
43
+ Returns
44
+ -------
45
+ A NumPy array containing the audio waveform, in float32 dtype.
46
+ """
47
+ if isinstance(file, np.ndarray):
48
+ if file.dtype != np.float32:
49
+ file = file.astype(np.float32)
50
+ if file.ndim > 1:
51
+ file = np.mean(file, axis=1)
52
+
53
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
54
+ write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
55
+ temp_file_path = temp_file.name
56
+ temp_file.close()
57
+ else:
58
+ temp_file_path = file
59
+
60
+ try:
61
+ cmd = [
62
+ "ffmpeg",
63
+ "-nostdin",
64
+ "-threads",
65
+ "0",
66
+ "-i",
67
+ temp_file_path,
68
+ "-f",
69
+ "s16le",
70
+ "-ac",
71
+ "1",
72
+ "-acodec",
73
+ "pcm_s16le",
74
+ "-ar",
75
+ str(sr),
76
+ "-",
77
+ ]
78
+ out = subprocess.run(cmd, capture_output=True, check=True).stdout
79
+ except subprocess.CalledProcessError as e:
80
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
81
+ finally:
82
+ if isinstance(file, np.ndarray):
83
+ os.remove(temp_file_path)
84
+
85
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
86
+
87
+
88
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
89
+ """
90
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
91
+ """
92
+ if torch.is_tensor(array):
93
+ if array.shape[axis] > length:
94
+ array = array.index_select(
95
+ dim=axis, index=torch.arange(length, device=array.device)
96
+ )
97
+
98
+ if array.shape[axis] < length:
99
+ pad_widths = [(0, 0)] * array.ndim
100
+ pad_widths[axis] = (0, length - array.shape[axis])
101
+ array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
102
+ else:
103
+ if array.shape[axis] > length:
104
+ array = array.take(indices=range(length), axis=axis)
105
+
106
+ if array.shape[axis] < length:
107
+ pad_widths = [(0, 0)] * array.ndim
108
+ pad_widths[axis] = (0, length - array.shape[axis])
109
+ array = np.pad(array, pad_widths)
110
+
111
+ return array
112
+
113
+
114
+ @lru_cache(maxsize=None)
115
+ def mel_filters(device, n_mels: int) -> torch.Tensor:
116
+ """
117
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
118
+ Allows decoupling librosa dependency; saved using:
119
+
120
+ np.savez_compressed(
121
+ "mel_filters.npz",
122
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
123
+ )
124
+ """
125
+ assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
126
+ with np.load(
127
+ os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
128
+ ) as f:
129
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
130
+
131
+
132
+ def log_mel_spectrogram(
133
+ audio: Union[str, np.ndarray, torch.Tensor],
134
+ n_mels: int,
135
+ padding: int = 0,
136
+ device: Optional[Union[str, torch.device]] = None,
137
+ ):
138
+ """
139
+ Compute the log-Mel spectrogram of the given audio.
140
+
141
+ Parameters
142
+ ----------
143
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
144
+ The path to the audio file, or a NumPy array / Tensor containing the audio waveform sampled at 16 kHz
145
+
146
+ n_mels: int
147
+ The number of Mel-frequency filters; only 80 and 128 are supported
148
+
149
+ padding: int
150
+ Number of zero samples to pad to the right
151
+
152
+ device: Optional[Union[str, torch.device]]
153
+ If given, the audio tensor is moved to this device before STFT
154
+
155
+ Returns
156
+ -------
157
+ torch.Tensor, shape = (80, n_frames)
158
+ A Tensor that contains the Mel spectrogram
159
+ """
160
+ if not torch.is_tensor(audio):
161
+ if isinstance(audio, str):
162
+ audio = load_audio(audio)
163
+ audio = torch.from_numpy(audio)
164
+
165
+ if device is not None:
166
+ audio = audio.to(device)
167
+ if padding > 0:
168
+ audio = F.pad(audio, (0, padding))
169
+ window = torch.hann_window(N_FFT).to(audio.device)
170
+ stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
171
+ magnitudes = stft[..., :-1].abs() ** 2
172
+
173
+ filters = mel_filters(audio.device, n_mels)
174
+ mel_spec = filters @ magnitudes
175
+
176
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
177
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
178
+ log_spec = (log_spec + 4.0) / 4.0
179
+ return log_spec
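A minimal usage sketch of the loader above (illustrative, not part of the commit): the audio path is an example, ffmpeg must be on PATH, and `mel_filters` expects an `assets/mel_filters.npz` file next to the module.

```python
# Illustrative only: load a file, window it to 30 s, and compute its log-Mel spectrogram.
from modules.diarize.audio_loader import load_audio, pad_or_trim, log_mel_spectrogram, N_SAMPLES

waveform = load_audio("demo/audio.wav")        # float32 mono waveform resampled to 16 kHz via ffmpeg
chunk = pad_or_trim(waveform, N_SAMPLES)       # pad or trim to one 30-second chunk
mel = log_mel_spectrogram(chunk, n_mels=80)    # torch.Tensor of shape (80, 3000)
print(mel.shape)
```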
modules/diarize/diarize_pipeline.py ADDED
@@ -0,0 +1,95 @@
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import os
6
+ from pyannote.audio import Pipeline
7
+ from typing import Optional, Union
8
+ import torch
9
+
10
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
11
+ from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
12
+
13
+
14
+ class DiarizationPipeline:
15
+ def __init__(
16
+ self,
17
+ model_name="pyannote/speaker-diarization-3.1",
18
+ cache_dir: str = DIARIZATION_MODELS_DIR,
19
+ use_auth_token=None,
20
+ device: Optional[Union[str, torch.device]] = "cpu",
21
+ ):
22
+ if isinstance(device, str):
23
+ device = torch.device(device)
24
+ self.model = Pipeline.from_pretrained(
25
+ model_name,
26
+ use_auth_token=use_auth_token,
27
+ cache_dir=cache_dir
28
+ ).to(device)
29
+
30
+ def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
31
+ if isinstance(audio, str):
32
+ audio = load_audio(audio)
33
+ audio_data = {
34
+ 'waveform': torch.from_numpy(audio[None, :]),
35
+ 'sample_rate': SAMPLE_RATE
36
+ }
37
+ segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
38
+ diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
39
+ diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
40
+ diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
41
+ return diarize_df
42
+
43
+
44
+ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
45
+ transcript_segments = transcript_result["segments"]
46
+ for seg in transcript_segments:
47
+ # assign speaker to segment (if any)
48
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
49
+ seg['start'])
50
+ diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
51
+
52
+ intersected = diarize_df[diarize_df["intersection"] > 0]
53
+
54
+ speaker = None
55
+ if len(intersected) > 0:
56
+ # Choose the speaker with the largest total intersection
57
+ speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
58
+ elif fill_nearest:
59
+ # Otherwise fall back to the closest segment
60
+ speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
61
+
62
+ if speaker is not None:
63
+ seg["speaker"] = speaker
64
+
65
+ # assign speaker to words
66
+ if 'words' in seg:
67
+ for word in seg['words']:
68
+ if 'start' in word:
69
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
70
+ diarize_df['start'], word['start'])
71
+ diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
72
+ word['start'])
73
+
74
+ intersected = diarize_df[diarize_df["intersection"] > 0]
75
+
76
+ word_speaker = None
77
+ if len(intersected) > 0:
78
+ # Choosing most strong intersection
79
+ word_speaker = \
80
+ intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
81
+ elif fill_nearest:
82
+ # Otherwise choosing closest
83
+ word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
84
+
85
+ if word_speaker is not None:
86
+ word["speaker"] = word_speaker
87
+
88
+ return transcript_result
89
+
90
+
91
+ class Segment:
92
+ def __init__(self, start, end, speaker=None):
93
+ self.start = start
94
+ self.end = end
95
+ self.speaker = speaker
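A hypothetical usage sketch of the pipeline above; the token and segment contents are placeholders, and the pyannote model terms must be accepted on Hugging Face before the first download.

```python
from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers

pipeline = DiarizationPipeline(use_auth_token="hf_xxx", device="cpu")   # placeholder token
diarize_df = pipeline("demo/audio.wav")                                 # DataFrame: segment, label, speaker, start, end
transcript = {"segments": [{"start": 0.0, "end": 2.5, "text": "Hello there."}]}
transcript = assign_word_speakers(diarize_df, transcript)               # adds a "speaker" key per segment
```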
modules/diarize/diarizer.py ADDED
@@ -0,0 +1,133 @@
1
+ import os
2
+ import torch
3
+ from typing import List, Union, BinaryIO, Optional
4
+ import numpy as np
5
+ import time
6
+ import logging
7
+
8
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
9
+ from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
10
+ from modules.diarize.audio_loader import load_audio
11
+
12
+
13
+ class Diarizer:
14
+ def __init__(self,
15
+ model_dir: str = DIARIZATION_MODELS_DIR
16
+ ):
17
+ self.device = self.get_device()
18
+ self.available_device = self.get_available_device()
19
+ self.compute_type = "float16"
20
+ self.model_dir = model_dir
21
+ os.makedirs(self.model_dir, exist_ok=True)
22
+ self.pipe = None
23
+
24
+ def run(self,
25
+ audio: Union[str, BinaryIO, np.ndarray],
26
+ transcribed_result: List[dict],
27
+ use_auth_token: str,
28
+ device: Optional[str] = None
29
+ ):
30
+ """
31
+ Diarize transcribed result as a post-processing
32
+
33
+ Parameters
34
+ ----------
35
+ audio: Union[str, BinaryIO, np.ndarray]
36
+ Audio input. This can be file path or binary type.
37
+ transcribed_result: List[dict]
38
+ transcribed result through whisper.
39
+ use_auth_token: str
40
+ Huggingface token with READ permission. This is only needed the first time you download the model.
41
+ You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
42
+ device: Optional[str]
43
+ Device for diarization.
44
+
45
+ Returns
46
+ ----------
47
+ segments_result: List[dict]
48
+ list of dicts that includes start, end timestamps and transcribed text
49
+ elapsed_time: float
50
+ elapsed time for running
51
+ """
52
+ start_time = time.time()
53
+
54
+ if device is None:
55
+ device = self.device
56
+
57
+ if device != self.device or self.pipe is None:
58
+ self.update_pipe(
59
+ device=device,
60
+ use_auth_token=use_auth_token
61
+ )
62
+
63
+ audio = load_audio(audio)
64
+
65
+ diarization_segments = self.pipe(audio)
66
+ diarized_result = assign_word_speakers(
67
+ diarization_segments,
68
+ {"segments": transcribed_result}
69
+ )
70
+
71
+ for segment in diarized_result["segments"]:
72
+ speaker = "None"
73
+ if "speaker" in segment:
74
+ speaker = segment["speaker"]
75
+ segment["text"] = speaker + ": " + segment["text"].strip()
76
+
77
+ elapsed_time = time.time() - start_time
78
+ return diarized_result["segments"], elapsed_time
79
+
80
+ def update_pipe(self,
81
+ use_auth_token: str,
82
+ device: str
83
+ ):
84
+ """
85
+ Set pipeline for diarization
86
+
87
+ Parameters
88
+ ----------
89
+ use_auth_token: str
90
+ Huggingface token with READ permission. This is only needed the first time you download the model.
91
+ You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
92
+ device: str
93
+ Device for diarization.
94
+ """
95
+ self.device = device
96
+
97
+ os.makedirs(self.model_dir, exist_ok=True)
98
+
99
+ if (not os.listdir(self.model_dir) and
100
+ not use_auth_token):
101
+ print(
102
+ "\nFailed to diarize. You need a Hugging Face token and must agree to their terms to download the diarization model.\n"
103
+ "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
104
+ )
105
+ return
106
+
107
+ logger = logging.getLogger("speechbrain.utils.train_logger")
108
+ # Disable redundant torchvision warning message
109
+ logger.disabled = True
110
+ self.pipe = DiarizationPipeline(
111
+ use_auth_token=use_auth_token,
112
+ device=device,
113
+ cache_dir=self.model_dir
114
+ )
115
+ logger.disabled = False
116
+
117
+ @staticmethod
118
+ def get_device():
119
+ if torch.cuda.is_available():
120
+ return "cuda"
121
+ elif torch.backends.mps.is_available():
122
+ return "mps"
123
+ else:
124
+ return "cpu"
125
+
126
+ @staticmethod
127
+ def get_available_device():
128
+ devices = ["cpu"]
129
+ if torch.cuda.is_available():
130
+ devices.append("cuda")
131
+ elif torch.backends.mps.is_available():
132
+ devices.append("mps")
133
+ return devices
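A hypothetical end-to-end sketch of `Diarizer.run()`; the transcribed segments and token are placeholders.

```python
from modules.diarize.diarizer import Diarizer

diarizer = Diarizer()
whisper_segments = [{"start": 0.0, "end": 2.5, "text": "Hello there."}]   # e.g. Whisper output
diarized_segments, elapsed = diarizer.run(
    audio="demo/audio.wav",
    transcribed_result=whisper_segments,
    use_auth_token="hf_xxx",   # READ token, only needed for the first model download
)
print(diarized_segments[0]["text"])   # e.g. "SPEAKER_00: Hello there."
```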
modules/translation/__init__.py ADDED
File without changes
modules/translation/deepl_api.py ADDED
@@ -0,0 +1,226 @@
1
+ import requests
2
+ import time
3
+ import os
4
+ from datetime import datetime
5
+ import gradio as gr
6
+
7
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.subtitle_manager import *
9
+ from modules.utils.files_manager import load_yaml, save_yaml
10
+
11
+ """
12
+ This is written with reference to the DeepL API documentation.
13
+ If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
14
+ """
15
+
16
+ DEEPL_AVAILABLE_TARGET_LANGS = {
17
+ 'Bulgarian': 'BG',
18
+ 'Czech': 'CS',
19
+ 'Danish': 'DA',
20
+ 'German': 'DE',
21
+ 'Greek': 'EL',
22
+ 'English': 'EN',
23
+ 'English (British)': 'EN-GB',
24
+ 'English (American)': 'EN-US',
25
+ 'Spanish': 'ES',
26
+ 'Estonian': 'ET',
27
+ 'Finnish': 'FI',
28
+ 'French': 'FR',
29
+ 'Hungarian': 'HU',
30
+ 'Indonesian': 'ID',
31
+ 'Italian': 'IT',
32
+ 'Japanese': 'JA',
33
+ 'Korean': 'KO',
34
+ 'Lithuanian': 'LT',
35
+ 'Latvian': 'LV',
36
+ 'Norwegian (Bokmål)': 'NB',
37
+ 'Dutch': 'NL',
38
+ 'Polish': 'PL',
39
+ 'Portuguese': 'PT',
40
+ 'Portuguese (Brazilian)': 'PT-BR',
41
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
42
+ 'Romanian': 'RO',
43
+ 'Russian': 'RU',
44
+ 'Slovak': 'SK',
45
+ 'Slovenian': 'SL',
46
+ 'Swedish': 'SV',
47
+ 'Turkish': 'TR',
48
+ 'Ukrainian': 'UK',
49
+ 'Chinese (simplified)': 'ZH'
50
+ }
51
+
52
+ DEEPL_AVAILABLE_SOURCE_LANGS = {
53
+ 'Automatic Detection': None,
54
+ 'Bulgarian': 'BG',
55
+ 'Czech': 'CS',
56
+ 'Danish': 'DA',
57
+ 'German': 'DE',
58
+ 'Greek': 'EL',
59
+ 'English': 'EN',
60
+ 'Spanish': 'ES',
61
+ 'Estonian': 'ET',
62
+ 'Finnish': 'FI',
63
+ 'French': 'FR',
64
+ 'Hungarian': 'HU',
65
+ 'Indonesian': 'ID',
66
+ 'Italian': 'IT',
67
+ 'Japanese': 'JA',
68
+ 'Korean': 'KO',
69
+ 'Lithuanian': 'LT',
70
+ 'Latvian': 'LV',
71
+ 'Norwegian (Bokmål)': 'NB',
72
+ 'Dutch': 'NL',
73
+ 'Polish': 'PL',
74
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
75
+ 'Romanian': 'RO',
76
+ 'Russian': 'RU',
77
+ 'Slovak': 'SK',
78
+ 'Slovenian': 'SL',
79
+ 'Swedish': 'SV',
80
+ 'Turkish': 'TR',
81
+ 'Ukrainian': 'UK',
82
+ 'Chinese': 'ZH'
83
+ }
84
+
85
+
86
+ class DeepLAPI:
87
+ def __init__(self,
88
+ output_dir: str = TRANSLATION_OUTPUT_DIR
89
+ ):
90
+ self.api_interval = 1
91
+ self.max_text_batch_size = 50
92
+ self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
93
+ self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
94
+ self.output_dir = output_dir
95
+
96
+ def translate_deepl(self,
97
+ auth_key: str,
98
+ fileobjs: list,
99
+ source_lang: str,
100
+ target_lang: str,
101
+ is_pro: bool = False,
102
+ add_timestamp: bool = True,
103
+ progress=gr.Progress()) -> list:
104
+ """
105
+ Translate subtitle files using DeepL API
106
+ Parameters
107
+ ----------
108
+ auth_key: str
109
+ API Key for DeepL from gr.Textbox()
110
+ fileobjs: list
111
+ List of subtitle files to translate from gr.Files()
112
+ source_lang: str
113
+ Source language of the file to translate from gr.Dropdown()
114
+ target_lang: str
115
+ Target language of the file to translate from gr.Dropdown()
116
+ is_pro: bool
117
+ Boolean value from gr.Checkbox() indicating whether to use the paid DeepL Pro API endpoint.
118
+ add_timestamp: bool
119
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
120
+ progress: gr.Progress
121
+ Indicator to show progress directly in gradio.
122
+
123
+ Returns
124
+ ----------
125
+ A List of
126
+ String to return to gr.Textbox()
127
+ Files to return to gr.Files()
128
+ """
129
+ if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
130
+ fileobjs = [fileobj.name for fileobj in fileobjs]
131
+
132
+ self.cache_parameters(
133
+ api_key=auth_key,
134
+ is_pro=is_pro,
135
+ source_lang=source_lang,
136
+ target_lang=target_lang,
137
+ add_timestamp=add_timestamp
138
+ )
139
+
140
+ files_info = {}
141
+ for fileobj in fileobjs:
142
+ file_path = fileobj
143
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
144
+
145
+ if file_ext == ".srt":
146
+ parsed_dicts = parse_srt(file_path=file_path)
147
+
148
+ elif file_ext == ".vtt":
149
+ parsed_dicts = parse_vtt(file_path=file_path)
150
+
151
+ batch_size = self.max_text_batch_size
152
+ for batch_start in range(0, len(parsed_dicts), batch_size):
153
+ batch_end = min(batch_start + batch_size, len(parsed_dicts))
154
+ sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
155
+ translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
156
+ target_lang, is_pro)
157
+ for i, translated_text in enumerate(translated_texts):
158
+ parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
159
+ progress(batch_end / len(parsed_dicts), desc="Translating..")
160
+
161
+ if file_ext == ".srt":
162
+ subtitle = get_serialized_srt(parsed_dicts)
163
+ elif file_ext == ".vtt":
164
+ subtitle = get_serialized_vtt(parsed_dicts)
165
+
166
+ if add_timestamp:
167
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
168
+ file_name += f"-{timestamp}"
169
+
170
+ output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
171
+ write_file(subtitle, output_path)
172
+
173
+ files_info[file_name] = {"subtitle": subtitle, "path": output_path}
174
+
175
+ total_result = ''
176
+ for file_name, info in files_info.items():
177
+ total_result += '------------------------------------\n'
178
+ total_result += f'{file_name}\n\n'
179
+ total_result += f'{info["subtitle"]}'
180
+ gr_str = f"Done! Subtitle is in the outputs/translations folder.\n\n{total_result}"
181
+
182
+ output_file_paths = [item["path"] for key, item in files_info.items()]
183
+ return [gr_str, output_file_paths]
184
+
185
+ def request_deepl_translate(self,
186
+ auth_key: str,
187
+ text: list,
188
+ source_lang: str,
189
+ target_lang: str,
190
+ is_pro: bool = False):
191
+ """Request API response to DeepL server"""
192
+ if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
193
+ raise ValueError(f"Source language {source_lang} is not supported. "
194
+ f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
195
+ if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
196
+ raise ValueError(f"Target language {target_lang} is not supported. "
197
+ f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")
198
+
199
+ url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
200
+ headers = {
201
+ 'Authorization': f'DeepL-Auth-Key {auth_key}'
202
+ }
203
+ data = {
204
+ 'text': text,
205
+ 'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
206
+ 'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
207
+ }
208
+ response = requests.post(url, headers=headers, data=data).json()
209
+ time.sleep(self.api_interval)
210
+ return response["translations"]
211
+
212
+ @staticmethod
213
+ def cache_parameters(api_key: str,
214
+ is_pro: bool,
215
+ source_lang: str,
216
+ target_lang: str,
217
+ add_timestamp: bool):
218
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
219
+ cached_params["translation"]["deepl"] = {
220
+ "api_key": api_key,
221
+ "is_pro": is_pro,
222
+ "source_lang": source_lang,
223
+ "target_lang": target_lang
224
+ }
225
+ cached_params["translation"]["add_timestamp"] = add_timestamp
226
+ save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
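A hypothetical call to the wrapper above; the API key and subtitle path are placeholders.

```python
from modules.translation.deepl_api import DeepLAPI

deepl = DeepLAPI()
message, output_paths = deepl.translate_deepl(
    auth_key="YOUR_DEEPL_API_KEY",          # placeholder
    fileobjs=["outputs/sample.srt"],        # placeholder subtitle file
    source_lang="Automatic Detection",
    target_lang="Korean",
    is_pro=False,
    add_timestamp=True,
)
print(output_paths)   # translated files written under outputs/translations
```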
modules/translation/nllb_inference.py ADDED
@@ -0,0 +1,287 @@
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import gradio as gr
3
+ import os
4
+
5
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
6
+ from modules.translation.translation_base import TranslationBase
7
+
8
+
9
+ class NLLBInference(TranslationBase):
10
+ def __init__(self,
11
+ model_dir: str = NLLB_MODELS_DIR,
12
+ output_dir: str = TRANSLATION_OUTPUT_DIR
13
+ ):
14
+ super().__init__(
15
+ model_dir=model_dir,
16
+ output_dir=output_dir
17
+ )
18
+ self.tokenizer = None
19
+ self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
20
+ self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
21
+ self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
22
+ self.pipeline = None
23
+
24
+ def translate(self,
25
+ text: str,
26
+ max_length: int
27
+ ):
28
+ result = self.pipeline(
29
+ text,
30
+ max_length=max_length
31
+ )
32
+ return result[0]['translation_text']
33
+
34
+ def update_model(self,
35
+ model_size: str,
36
+ src_lang: str,
37
+ tgt_lang: str,
38
+ progress: gr.Progress = gr.Progress()
39
+ ):
40
+ def validate_language(lang: str) -> str:
41
+ if lang in NLLB_AVAILABLE_LANGS:
42
+ return NLLB_AVAILABLE_LANGS[lang]
43
+ elif lang not in NLLB_AVAILABLE_LANGS.values():
44
+ raise ValueError(
45
+ f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
46
+ return lang
47
+
48
+ src_lang = validate_language(src_lang)
49
+ tgt_lang = validate_language(tgt_lang)
50
+
51
+ if model_size != self.current_model_size or self.model is None:
52
+ print("\nInitializing NLLB Model..\n")
53
+ progress(0, desc="Initializing NLLB Model..")
54
+ self.current_model_size = model_size
55
+ local_files_only = self.is_model_exists(self.current_model_size)
56
+ self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
57
+ cache_dir=self.model_dir,
58
+ local_files_only=local_files_only)
59
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
60
+ cache_dir=os.path.join(self.model_dir, "tokenizers"),
61
+ local_files_only=local_files_only)
62
+
63
+ self.pipeline = pipeline("translation",
64
+ model=self.model,
65
+ tokenizer=self.tokenizer,
66
+ src_lang=src_lang,
67
+ tgt_lang=tgt_lang,
68
+ device=self.device)
69
+
70
+ def is_model_exists(self,
71
+ model_size: str):
72
+ """Check if model exists or not (Only facebook model)"""
73
+ prefix = "models--facebook--"
74
+ _id, model_size_name = model_size.split("/")
75
+ model_dir_name = prefix + model_size_name
76
+ model_dir_path = os.path.join(self.model_dir, model_dir_name)
77
+ if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
78
+ return True
79
+ return False
80
+
81
+
82
+ NLLB_AVAILABLE_LANGS = {
83
+ "Acehnese (Arabic script)": "ace_Arab",
84
+ "Acehnese (Latin script)": "ace_Latn",
85
+ "Mesopotamian Arabic": "acm_Arab",
86
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
87
+ "Tunisian Arabic": "aeb_Arab",
88
+ "Afrikaans": "afr_Latn",
89
+ "South Levantine Arabic": "ajp_Arab",
90
+ "Akan": "aka_Latn",
91
+ "Amharic": "amh_Ethi",
92
+ "North Levantine Arabic": "apc_Arab",
93
+ "Modern Standard Arabic": "arb_Arab",
94
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
95
+ "Najdi Arabic": "ars_Arab",
96
+ "Moroccan Arabic": "ary_Arab",
97
+ "Egyptian Arabic": "arz_Arab",
98
+ "Assamese": "asm_Beng",
99
+ "Asturian": "ast_Latn",
100
+ "Awadhi": "awa_Deva",
101
+ "Central Aymara": "ayr_Latn",
102
+ "South Azerbaijani": "azb_Arab",
103
+ "North Azerbaijani": "azj_Latn",
104
+ "Bashkir": "bak_Cyrl",
105
+ "Bambara": "bam_Latn",
106
+ "Balinese": "ban_Latn",
107
+ "Belarusian": "bel_Cyrl",
108
+ "Bemba": "bem_Latn",
109
+ "Bengali": "ben_Beng",
110
+ "Bhojpuri": "bho_Deva",
111
+ "Banjar (Arabic script)": "bjn_Arab",
112
+ "Banjar (Latin script)": "bjn_Latn",
113
+ "Standard Tibetan": "bod_Tibt",
114
+ "Bosnian": "bos_Latn",
115
+ "Buginese": "bug_Latn",
116
+ "Bulgarian": "bul_Cyrl",
117
+ "Catalan": "cat_Latn",
118
+ "Cebuano": "ceb_Latn",
119
+ "Czech": "ces_Latn",
120
+ "Chokwe": "cjk_Latn",
121
+ "Central Kurdish": "ckb_Arab",
122
+ "Crimean Tatar": "crh_Latn",
123
+ "Welsh": "cym_Latn",
124
+ "Danish": "dan_Latn",
125
+ "German": "deu_Latn",
126
+ "Southwestern Dinka": "dik_Latn",
127
+ "Dyula": "dyu_Latn",
128
+ "Dzongkha": "dzo_Tibt",
129
+ "Greek": "ell_Grek",
130
+ "English": "eng_Latn",
131
+ "Esperanto": "epo_Latn",
132
+ "Estonian": "est_Latn",
133
+ "Basque": "eus_Latn",
134
+ "Ewe": "ewe_Latn",
135
+ "Faroese": "fao_Latn",
136
+ "Fijian": "fij_Latn",
137
+ "Finnish": "fin_Latn",
138
+ "Fon": "fon_Latn",
139
+ "French": "fra_Latn",
140
+ "Friulian": "fur_Latn",
141
+ "Nigerian Fulfulde": "fuv_Latn",
142
+ "Scottish Gaelic": "gla_Latn",
143
+ "Irish": "gle_Latn",
144
+ "Galician": "glg_Latn",
145
+ "Guarani": "grn_Latn",
146
+ "Gujarati": "guj_Gujr",
147
+ "Haitian Creole": "hat_Latn",
148
+ "Hausa": "hau_Latn",
149
+ "Hebrew": "heb_Hebr",
150
+ "Hindi": "hin_Deva",
151
+ "Chhattisgarhi": "hne_Deva",
152
+ "Croatian": "hrv_Latn",
153
+ "Hungarian": "hun_Latn",
154
+ "Armenian": "hye_Armn",
155
+ "Igbo": "ibo_Latn",
156
+ "Ilocano": "ilo_Latn",
157
+ "Indonesian": "ind_Latn",
158
+ "Icelandic": "isl_Latn",
159
+ "Italian": "ita_Latn",
160
+ "Javanese": "jav_Latn",
161
+ "Japanese": "jpn_Jpan",
162
+ "Kabyle": "kab_Latn",
163
+ "Jingpho": "kac_Latn",
164
+ "Kamba": "kam_Latn",
165
+ "Kannada": "kan_Knda",
166
+ "Kashmiri (Arabic script)": "kas_Arab",
167
+ "Kashmiri (Devanagari script)": "kas_Deva",
168
+ "Georgian": "kat_Geor",
169
+ "Central Kanuri (Arabic script)": "knc_Arab",
170
+ "Central Kanuri (Latin script)": "knc_Latn",
171
+ "Kazakh": "kaz_Cyrl",
172
+ "Kabiyè": "kbp_Latn",
173
+ "Kabuverdianu": "kea_Latn",
174
+ "Khmer": "khm_Khmr",
175
+ "Kikuyu": "kik_Latn",
176
+ "Kinyarwanda": "kin_Latn",
177
+ "Kyrgyz": "kir_Cyrl",
178
+ "Kimbundu": "kmb_Latn",
179
+ "Northern Kurdish": "kmr_Latn",
180
+ "Kikongo": "kon_Latn",
181
+ "Korean": "kor_Hang",
182
+ "Lao": "lao_Laoo",
183
+ "Ligurian": "lij_Latn",
184
+ "Limburgish": "lim_Latn",
185
+ "Lingala": "lin_Latn",
186
+ "Lithuanian": "lit_Latn",
187
+ "Lombard": "lmo_Latn",
188
+ "Latgalian": "ltg_Latn",
189
+ "Luxembourgish": "ltz_Latn",
190
+ "Luba-Kasai": "lua_Latn",
191
+ "Ganda": "lug_Latn",
192
+ "Luo": "luo_Latn",
193
+ "Mizo": "lus_Latn",
194
+ "Standard Latvian": "lvs_Latn",
195
+ "Magahi": "mag_Deva",
196
+ "Maithili": "mai_Deva",
197
+ "Malayalam": "mal_Mlym",
198
+ "Marathi": "mar_Deva",
199
+ "Minangkabau (Arabic script)": "min_Arab",
200
+ "Minangkabau (Latin script)": "min_Latn",
201
+ "Macedonian": "mkd_Cyrl",
202
+ "Plateau Malagasy": "plt_Latn",
203
+ "Maltese": "mlt_Latn",
204
+ "Meitei (Bengali script)": "mni_Beng",
205
+ "Halh Mongolian": "khk_Cyrl",
206
+ "Mossi": "mos_Latn",
207
+ "Maori": "mri_Latn",
208
+ "Burmese": "mya_Mymr",
209
+ "Dutch": "nld_Latn",
210
+ "Norwegian Nynorsk": "nno_Latn",
211
+ "Norwegian Bokmål": "nob_Latn",
212
+ "Nepali": "npi_Deva",
213
+ "Northern Sotho": "nso_Latn",
214
+ "Nuer": "nus_Latn",
215
+ "Nyanja": "nya_Latn",
216
+ "Occitan": "oci_Latn",
217
+ "West Central Oromo": "gaz_Latn",
218
+ "Odia": "ory_Orya",
219
+ "Pangasinan": "pag_Latn",
220
+ "Eastern Panjabi": "pan_Guru",
221
+ "Papiamento": "pap_Latn",
222
+ "Western Persian": "pes_Arab",
223
+ "Polish": "pol_Latn",
224
+ "Portuguese": "por_Latn",
225
+ "Dari": "prs_Arab",
226
+ "Southern Pashto": "pbt_Arab",
227
+ "Ayacucho Quechua": "quy_Latn",
228
+ "Romanian": "ron_Latn",
229
+ "Rundi": "run_Latn",
230
+ "Russian": "rus_Cyrl",
231
+ "Sango": "sag_Latn",
232
+ "Sanskrit": "san_Deva",
233
+ "Santali": "sat_Olck",
234
+ "Sicilian": "scn_Latn",
235
+ "Shan": "shn_Mymr",
236
+ "Sinhala": "sin_Sinh",
237
+ "Slovak": "slk_Latn",
238
+ "Slovenian": "slv_Latn",
239
+ "Samoan": "smo_Latn",
240
+ "Shona": "sna_Latn",
241
+ "Sindhi": "snd_Arab",
242
+ "Somali": "som_Latn",
243
+ "Southern Sotho": "sot_Latn",
244
+ "Spanish": "spa_Latn",
245
+ "Tosk Albanian": "als_Latn",
246
+ "Sardinian": "srd_Latn",
247
+ "Serbian": "srp_Cyrl",
248
+ "Swati": "ssw_Latn",
249
+ "Sundanese": "sun_Latn",
250
+ "Swedish": "swe_Latn",
251
+ "Swahili": "swh_Latn",
252
+ "Silesian": "szl_Latn",
253
+ "Tamil": "tam_Taml",
254
+ "Tatar": "tat_Cyrl",
255
+ "Telugu": "tel_Telu",
256
+ "Tajik": "tgk_Cyrl",
257
+ "Tagalog": "tgl_Latn",
258
+ "Thai": "tha_Thai",
259
+ "Tigrinya": "tir_Ethi",
260
+ "Tamasheq (Latin script)": "taq_Latn",
261
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
262
+ "Tok Pisin": "tpi_Latn",
263
+ "Tswana": "tsn_Latn",
264
+ "Tsonga": "tso_Latn",
265
+ "Turkmen": "tuk_Latn",
266
+ "Tumbuka": "tum_Latn",
267
+ "Turkish": "tur_Latn",
268
+ "Twi": "twi_Latn",
269
+ "Central Atlas Tamazight": "tzm_Tfng",
270
+ "Uyghur": "uig_Arab",
271
+ "Ukrainian": "ukr_Cyrl",
272
+ "Umbundu": "umb_Latn",
273
+ "Urdu": "urd_Arab",
274
+ "Northern Uzbek": "uzn_Latn",
275
+ "Venetian": "vec_Latn",
276
+ "Vietnamese": "vie_Latn",
277
+ "Waray": "war_Latn",
278
+ "Wolof": "wol_Latn",
279
+ "Xhosa": "xho_Latn",
280
+ "Eastern Yiddish": "ydd_Hebr",
281
+ "Yoruba": "yor_Latn",
282
+ "Yue Chinese": "yue_Hant",
283
+ "Chinese (Simplified)": "zho_Hans",
284
+ "Chinese (Traditional)": "zho_Hant",
285
+ "Standard Malay": "zsm_Latn",
286
+ "Zulu": "zul_Latn",
287
+ }
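A hypothetical call; `translate_file()` is inherited from `TranslationBase` (next file), and the distilled 600M model is downloaded on first use.

```python
from modules.translation.nllb_inference import NLLBInference

nllb = NLLBInference()
message, output_paths = nllb.translate_file(
    fileobjs=["outputs/sample.srt"],        # placeholder subtitle file
    model_size="facebook/nllb-200-distilled-600M",
    src_lang="English",
    tgt_lang="Korean",
    max_length=200,
)
```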
modules/translation/translation_base.py ADDED
@@ -0,0 +1,177 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from datetime import datetime
7
+
8
+ from modules.whisper.whisper_parameter import *
9
+ from modules.utils.subtitle_manager import *
10
+ from modules.utils.files_manager import load_yaml, save_yaml
11
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
12
+
13
+
14
+ class TranslationBase(ABC):
15
+ def __init__(self,
16
+ model_dir: str = NLLB_MODELS_DIR,
17
+ output_dir: str = TRANSLATION_OUTPUT_DIR
18
+ ):
19
+ super().__init__()
20
+ self.model = None
21
+ self.model_dir = model_dir
22
+ self.output_dir = output_dir
23
+ os.makedirs(self.model_dir, exist_ok=True)
24
+ os.makedirs(self.output_dir, exist_ok=True)
25
+ self.current_model_size = None
26
+ self.device = self.get_device()
27
+
28
+ @abstractmethod
29
+ def translate(self,
30
+ text: str,
31
+ max_length: int
32
+ ):
33
+ pass
34
+
35
+ @abstractmethod
36
+ def update_model(self,
37
+ model_size: str,
38
+ src_lang: str,
39
+ tgt_lang: str,
40
+ progress: gr.Progress = gr.Progress()
41
+ ):
42
+ pass
43
+
44
+ def translate_file(self,
45
+ fileobjs: list,
46
+ model_size: str,
47
+ src_lang: str,
48
+ tgt_lang: str,
49
+ max_length: int = 200,
50
+ add_timestamp: bool = True,
51
+ progress=gr.Progress()) -> list:
52
+ """
53
+ Translate subtitle file from source language to target language
54
+
55
+ Parameters
56
+ ----------
57
+ fileobjs: list
58
+ List of subtitle files to translate from gr.Files()
59
+ model_size: str
60
+ Translation model size from gr.Dropdown()
61
+ src_lang: str
62
+ Source language of the file to translate from gr.Dropdown()
63
+ tgt_lang: str
64
+ Target language of the file to translate from gr.Dropdown()
65
+ max_length: int
66
+ Max length per line to translate
67
+ add_timestamp: bool
68
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
69
+ progress: gr.Progress
70
+ Indicator to show progress directly in gradio.
71
+ I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
72
+
73
+ Returns
74
+ ----------
75
+ A List of
76
+ String to return to gr.Textbox()
77
+ Files to return to gr.Files()
78
+ """
79
+ try:
80
+ if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
81
+ fileobjs = [file.name for file in fileobjs]
82
+
83
+ self.cache_parameters(model_size=model_size,
84
+ src_lang=src_lang,
85
+ tgt_lang=tgt_lang,
86
+ max_length=max_length,
87
+ add_timestamp=add_timestamp)
88
+
89
+ self.update_model(model_size=model_size,
90
+ src_lang=src_lang,
91
+ tgt_lang=tgt_lang,
92
+ progress=progress)
93
+
94
+ files_info = {}
95
+ for fileobj in fileobjs:
96
+ file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
97
+ if file_ext == ".srt":
98
+ parsed_dicts = parse_srt(file_path=fileobj)
99
+ total_progress = len(parsed_dicts)
100
+ for index, dic in enumerate(parsed_dicts):
101
+ progress(index / total_progress, desc="Translating..")
102
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
103
+ dic["sentence"] = translated_text
104
+ subtitle = get_serialized_srt(parsed_dicts)
105
+
106
+ elif file_ext == ".vtt":
107
+ parsed_dicts = parse_vtt(file_path=fileobj)
108
+ total_progress = len(parsed_dicts)
109
+ for index, dic in enumerate(parsed_dicts):
110
+ progress(index / total_progress, desc="Translating..")
111
+ translated_text = self.translate(dic["sentence"], max_length=max_length)
112
+ dic["sentence"] = translated_text
113
+ subtitle = get_serialized_vtt(parsed_dicts)
114
+
115
+ if add_timestamp:
116
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
117
+ file_name += f"-{timestamp}"
118
+
119
+ output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
120
+ write_file(subtitle, output_path)
121
+
122
+ files_info[file_name] = {"subtitle": subtitle, "path": output_path}
123
+
124
+ total_result = ''
125
+ for file_name, info in files_info.items():
126
+ total_result += '------------------------------------\n'
127
+ total_result += f'{file_name}\n\n'
128
+ total_result += f'{info["subtitle"]}'
129
+ gr_str = f"Done! Subtitle is in the outputs/translations folder.\n\n{total_result}"
130
+
131
+ output_file_paths = [item["path"] for key, item in files_info.items()]
132
+ return [gr_str, output_file_paths]
133
+
134
+ except Exception as e:
135
+ print(f"Error: {str(e)}")
136
+ finally:
137
+ self.release_cuda_memory()
138
+
139
+ @staticmethod
140
+ def get_device():
141
+ if torch.cuda.is_available():
142
+ return "cuda"
143
+ elif torch.backends.mps.is_available():
144
+ return "mps"
145
+ else:
146
+ return "cpu"
147
+
148
+ @staticmethod
149
+ def release_cuda_memory():
150
+ if torch.cuda.is_available():
151
+ torch.cuda.empty_cache()
152
+ torch.cuda.reset_max_memory_allocated()
153
+
154
+ @staticmethod
155
+ def remove_input_files(file_paths: List[str]):
156
+ if not file_paths:
157
+ return
158
+
159
+ for file_path in file_paths:
160
+ if file_path and os.path.exists(file_path):
161
+ os.remove(file_path)
162
+
163
+ @staticmethod
164
+ def cache_parameters(model_size: str,
165
+ src_lang: str,
166
+ tgt_lang: str,
167
+ max_length: int,
168
+ add_timestamp: bool):
169
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
170
+ cached_params["translation"]["nllb"] = {
171
+ "model_size": model_size,
172
+ "source_lang": src_lang,
173
+ "target_lang": tgt_lang,
174
+ "max_length": max_length,
175
+ }
176
+ cached_params["translation"]["add_timestamp"] = add_timestamp
177
+ save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
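A toy subclass sketch (illustrative only) showing the two abstract methods a new translation backend has to provide.

```python
import gradio as gr
from modules.translation.translation_base import TranslationBase


class UpperCaseTranslator(TranslationBase):
    """Toy 'translator' that upper-cases every subtitle line."""

    def update_model(self, model_size, src_lang, tgt_lang, progress=gr.Progress()):
        pass   # nothing to load for this toy backend

    def translate(self, text: str, max_length: int):
        return text.upper()[:max_length]
```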
modules/ui/__init__.py ADDED
File without changes
modules/ui/htmls.py ADDED
@@ -0,0 +1,97 @@
1
+ CSS = """
2
+ .bmc-button {
3
+ padding: 2px 5px;
4
+ border-radius: 5px;
5
+ background-color: #FF813F;
6
+ color: white;
7
+ box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
8
+ text-decoration: none;
9
+ display: inline-block;
10
+ font-size: 20px;
11
+ margin: 2px;
12
+ cursor: pointer;
13
+ -webkit-transition: background-color 0.3s ease;
14
+ -ms-transition: background-color 0.3s ease;
15
+ transition: background-color 0.3s ease;
16
+ }
17
+ .bmc-button:hover,
18
+ .bmc-button:active,
19
+ .bmc-button:focus {
20
+ background-color: #FF5633;
21
+ }
22
+ .markdown {
23
+ margin-bottom: 0;
24
+ padding-bottom: 0;
25
+ }
26
+ .tabs {
27
+ margin-top: 0;
28
+ padding-top: 0;
29
+ }
30
+
31
+ #md_project a {
32
+ color: black;
33
+ text-decoration: none;
34
+ }
35
+ #md_project a:hover {
36
+ text-decoration: underline;
37
+ }
38
+ """
39
+
40
+ MARKDOWN = """
41
+ # Automatic speech recognition
42
+ """
43
+
44
+
45
+ NLLB_VRAM_TABLE = """
46
+ <!DOCTYPE html>
47
+ <html lang="en">
48
+ <head>
49
+ <meta charset="UTF-8">
50
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
51
+ <style>
52
+ table {
53
+ border-collapse: collapse;
54
+ width: 100%;
55
+ }
56
+ th, td {
57
+ border: 1px solid #dddddd;
58
+ text-align: left;
59
+ padding: 8px;
60
+ }
61
+ th {
62
+ background-color: #f2f2f2;
63
+ }
64
+ </style>
65
+ </head>
66
+ <body>
67
+
68
+ <details>
69
+ <summary>VRAM usage for each model</summary>
70
+ <table>
71
+ <thead>
72
+ <tr>
73
+ <th>Model name</th>
74
+ <th>Required VRAM</th>
75
+ </tr>
76
+ </thead>
77
+ <tbody>
78
+ <tr>
79
+ <td>nllb-200-3.3B</td>
80
+ <td>~16GB</td>
81
+ </tr>
82
+ <tr>
83
+ <td>nllb-200-1.3B</td>
84
+ <td>~8GB</td>
85
+ </tr>
86
+ <tr>
87
+ <td>nllb-200-distilled-600M</td>
88
+ <td>~4GB</td>
89
+ </tr>
90
+ </tbody>
91
+ </table>
92
+ <p><strong>Note:</strong> Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>
93
+ </details>
94
+
95
+ </body>
96
+ </html>
97
+ """
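A hypothetical sketch of how these constants could be wired into a Gradio Blocks app (the actual wiring in app.py may differ).

```python
import gradio as gr
from modules.ui.htmls import CSS, MARKDOWN, NLLB_VRAM_TABLE

with gr.Blocks(css=CSS) as demo:
    gr.Markdown(MARKDOWN, elem_id="md_project")
    gr.HTML(NLLB_VRAM_TABLE)

# demo.launch()
```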
modules/utils/__init__.py ADDED
File without changes
modules/utils/cli_manager.py ADDED
@@ -0,0 +1,12 @@
1
+ import argparse
2
+
3
+
4
+ def str2bool(v):
5
+ if isinstance(v, bool):
6
+ return v
7
+ if v.lower() in ('yes', 'true', 't', 'y', '1'):
8
+ return True
9
+ elif v.lower() in ('no', 'false', 'f', 'n', '0'):
10
+ return False
11
+ else:
12
+ raise argparse.ArgumentTypeError('Boolean value expected.')
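A small sketch of how `str2bool` is typically hooked into argparse; the flag name is an example.

```python
import argparse
from modules.utils.cli_manager import str2bool

parser = argparse.ArgumentParser()
parser.add_argument("--share", type=str2bool, default=False, nargs="?", const=True)

print(parser.parse_args(["--share", "yes"]).share)   # True
print(parser.parse_args(["--share"]).share)          # True (bare flag falls back to const)
print(parser.parse_args([]).share)                   # False (default)
```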
modules/utils/files_manager.py ADDED
@@ -0,0 +1,69 @@
1
+ import os
2
+ import fnmatch
3
+ from ruamel.yaml import YAML
4
+ from gradio.utils import NamedString
5
+
6
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
7
+
8
+
9
+ def load_yaml(path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
10
+ yaml = YAML(typ="safe")
11
+ yaml.preserve_quotes = True
12
+ with open(path, 'r', encoding='utf-8') as file:
13
+ config = yaml.load(file)
14
+ return config
15
+
16
+
17
+ def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
18
+ yaml = YAML(typ="safe")
19
+ yaml.map_indent = 2
20
+ yaml.sequence_indent = 4
21
+ yaml.sequence_dash_offset = 2
22
+ yaml.preserve_quotes = True
23
+ yaml.default_flow_style = False
24
+ yaml.sort_base_mapping_type_on_output = False
25
+
26
+ with open(path, 'w', encoding='utf-8') as file:
27
+ yaml.dump(data, file)
28
+ return path
29
+
30
+
31
+ def get_media_files(folder_path, include_sub_directory=False):
32
+ video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
33
+ '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
34
+ audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
35
+ media_extensions = video_extensions + audio_extensions
36
+
37
+ media_files = []
38
+
39
+ if include_sub_directory:
40
+ for root, _, files in os.walk(folder_path):
41
+ for extension in media_extensions:
42
+ media_files.extend(
43
+ os.path.join(root, file) for file in fnmatch.filter(files, extension)
44
+ if os.path.exists(os.path.join(root, file))
45
+ )
46
+ else:
47
+ for extension in media_extensions:
48
+ media_files.extend(
49
+ os.path.join(folder_path, file) for file in fnmatch.filter(os.listdir(folder_path), extension)
50
+ if os.path.isfile(os.path.join(folder_path, file)) and os.path.exists(os.path.join(folder_path, file))
51
+ )
52
+
53
+ return media_files
54
+
55
+
56
+ def format_gradio_files(files: list):
57
+ if not files:
58
+ return files
59
+
60
+ gradio_files = []
61
+ for file in files:
62
+ gradio_files.append(NamedString(file))
63
+ return gradio_files
64
+
65
+
66
+ def is_video(file_path):
67
+ video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
68
+ extension = os.path.splitext(file_path)[1].lower()
69
+ return extension in video_extensions
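A minimal sketch of the YAML helpers and media scan above; the folder path is an example.

```python
from modules.utils.files_manager import load_yaml, save_yaml, get_media_files

params = load_yaml()                                  # reads configs/default_parameters.yaml by default
params["translation"]["add_timestamp"] = False
save_yaml(params)                                     # writes it back to the same file

print(get_media_files("demo", include_sub_directory=True))
```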
modules/utils/paths.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+
3
+ WEBUI_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
4
+ MODELS_DIR = os.path.join(WEBUI_DIR, "models")
5
+ WHISPER_MODELS_DIR = os.path.join(MODELS_DIR, "Whisper")
6
+ FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
7
+ INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
8
+ NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
9
+ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
10
+ UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
+ CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
+ DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
13
+ OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
14
+ TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
15
+ UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
16
+ UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
17
+ UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")
18
+
19
+ for dir_path in [MODELS_DIR,
20
+ WHISPER_MODELS_DIR,
21
+ FASTER_WHISPER_MODELS_DIR,
22
+ INSANELY_FAST_WHISPER_MODELS_DIR,
23
+ NLLB_MODELS_DIR,
24
+ DIARIZATION_MODELS_DIR,
25
+ UVR_MODELS_DIR,
26
+ CONFIGS_DIR,
27
+ OUTPUT_DIR,
28
+ TRANSLATION_OUTPUT_DIR,
29
+ UVR_INSTRUMENTAL_OUTPUT_DIR,
30
+ UVR_VOCALS_OUTPUT_DIR]:
31
+ os.makedirs(dir_path, exist_ok=True)
modules/utils/subtitle_manager.py ADDED
@@ -0,0 +1,132 @@
1
+ import re
2
+
3
+
4
+ def timeformat_srt(time):
5
+ hours = time // 3600
6
+ minutes = (time - hours * 3600) // 60
7
+ seconds = time - hours * 3600 - minutes * 60
8
+ milliseconds = (time - int(time)) * 1000
9
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
10
+
11
+
12
+ def timeformat_vtt(time):
13
+ hours = time // 3600
14
+ minutes = (time - hours * 3600) // 60
15
+ seconds = time - hours * 3600 - minutes * 60
16
+ milliseconds = (time - int(time)) * 1000
17
+ return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
18
+
19
+
20
+ def write_file(subtitle, output_file):
21
+ with open(output_file, 'w', encoding='utf-8') as f:
22
+ f.write(subtitle)
23
+
24
+
25
+ def get_srt(segments):
26
+ output = ""
27
+ for i, segment in enumerate(segments):
28
+ output += f"{i + 1}\n"
29
+ output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
30
+ if segment['text'].startswith(' '):
31
+ segment['text'] = segment['text'][1:]
32
+ output += f"{segment['text']}\n\n"
33
+ return output
34
+
35
+
36
+ def get_vtt(segments):
37
+ output = "WebVTT\n\n"
38
+ for i, segment in enumerate(segments):
39
+ output += f"{i + 1}\n"
40
+ output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
41
+ if segment['text'].startswith(' '):
42
+ segment['text'] = segment['text'][1:]
43
+ output += f"{segment['text']}\n\n"
44
+ return output
45
+
46
+
47
+ def get_txt(segments):
48
+ output = ""
49
+ for i, segment in enumerate(segments):
50
+ if segment['text'].startswith(' '):
51
+ segment['text'] = segment['text'][1:]
52
+ output += f"{segment['text']}\n"
53
+ return output
54
+
55
+
56
+ def parse_srt(file_path):
57
+ """Reads SRT file and returns as dict"""
58
+ with open(file_path, 'r', encoding='utf-8') as file:
59
+ srt_data = file.read()
60
+
61
+ data = []
62
+ blocks = srt_data.split('\n\n')
63
+
64
+ for block in blocks:
65
+ if block.strip() != '':
66
+ lines = block.strip().split('\n')
67
+ index = lines[0]
68
+ timestamp = lines[1]
69
+ sentence = ' '.join(lines[2:])
70
+
71
+ data.append({
72
+ "index": index,
73
+ "timestamp": timestamp,
74
+ "sentence": sentence
75
+ })
76
+ return data
77
+
78
+
79
+ def parse_vtt(file_path):
80
+ """Reads WebVTT file and returns as dict"""
81
+ with open(file_path, 'r', encoding='utf-8') as file:
82
+ webvtt_data = file.read()
83
+
84
+ data = []
85
+ blocks = webvtt_data.split('\n\n')
86
+
87
+ for block in blocks:
88
+ if block.strip() != '' and not block.strip().startswith("WebVTT"):
89
+ lines = block.strip().split('\n')
90
+ index = lines[0]
91
+ timestamp = lines[1]
92
+ sentence = ' '.join(lines[2:])
93
+
94
+ data.append({
95
+ "index": index,
96
+ "timestamp": timestamp,
97
+ "sentence": sentence
98
+ })
99
+
100
+ return data
101
+
102
+
103
+ def get_serialized_srt(dicts):
104
+ output = ""
105
+ for dic in dicts:
106
+ output += f'{dic["index"]}\n'
107
+ output += f'{dic["timestamp"]}\n'
108
+ output += f'{dic["sentence"]}\n\n'
109
+ return output
110
+
111
+
112
+ def get_serialized_vtt(dicts):
113
+ output = "WebVTT\n\n"
114
+ for dic in dicts:
115
+ output += f'{dic["index"]}\n'
116
+ output += f'{dic["timestamp"]}\n'
117
+ output += f'{dic["sentence"]}\n\n'
118
+ return output
119
+
120
+
121
+ def safe_filename(name):
122
+ INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
123
+ safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
124
+ # Truncate the filename if it exceeds the max_length (20)
125
+ if len(safe_name) > 20:
126
+ file_extension = safe_name.split('.')[-1]
127
+ if len(file_extension) + 1 < 20:
128
+ truncated_name = safe_name[:20 - len(file_extension) - 1]
129
+ safe_name = truncated_name + '.' + file_extension
130
+ else:
131
+ safe_name = safe_name[:20]
132
+ return safe_name
modules/utils/youtube_manager.py ADDED
@@ -0,0 +1,33 @@
1
+ from pytubefix import YouTube
2
+ import subprocess
3
+ import os
4
+
5
+
6
+ def get_ytdata(link):
7
+ return YouTube(link)
8
+
9
+
10
+ def get_ytmetas(link):
11
+ yt = YouTube(link)
12
+ return yt.thumbnail_url, yt.title, yt.description
13
+
14
+
15
+ def get_ytaudio(ytdata: YouTube):
16
+ # The downloaded audio can be corrupted, so re-encode it with ffmpeg into a valid audio file.
17
+ # Fix for : https://github.com/jhj0517/Whisper-WebUI/issues/304
18
+
19
+ audio_path = ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
20
+ temp_audio_path = os.path.join("modules", "yt_tmp_fixed.wav")
21
+
22
+ try:
23
+ subprocess.run([
24
+ 'ffmpeg', '-y',
25
+ '-i', audio_path,
26
+ temp_audio_path
27
+ ], check=True)
28
+
29
+ os.replace(temp_audio_path, audio_path)
30
+ return audio_path
31
+ except subprocess.CalledProcessError as e:
32
+ print(f"Error during ffmpeg conversion: {e}")
33
+ return None
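A hypothetical call sequence for the helpers above; the URL is a placeholder and ffmpeg must be on PATH.

```python
from modules.utils.youtube_manager import get_ytdata, get_ytmetas, get_ytaudio

url = "https://www.youtube.com/watch?v=XXXXXXXXXXX"   # placeholder
thumbnail, title, description = get_ytmetas(url)
audio_path = get_ytaudio(get_ytdata(url))             # downloads and re-encodes to modules/yt_tmp.wav
print(title, audio_path)
```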
modules/uvr/music_separator.py ADDED
@@ -0,0 +1,183 @@
1
+ from typing import Optional, Union, List, Dict
2
+ import numpy as np
3
+ import torchaudio
4
+ import soundfile as sf
5
+ import os
6
+ import torch
7
+ import gc
8
+ import gradio as gr
9
+ from datetime import datetime
10
+
11
+ from uvr.models import MDX, Demucs, VrNetwork, MDXC
12
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
13
+ from modules.utils.files_manager import load_yaml, save_yaml, is_video
14
+ from modules.diarize.audio_loader import load_audio
15
+
16
+ class MusicSeparator:
17
+ def __init__(self,
18
+ model_dir: Optional[str] = None,
19
+ output_dir: Optional[str] = None):
20
+ self.model = None
21
+ self.device = self.get_device()
22
+ self.available_devices = ["cpu", "cuda"]
23
+ self.model_dir = model_dir
24
+ self.output_dir = output_dir
25
+ instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
26
+ vocals_output_dir = os.path.join(self.output_dir, "vocals")
27
+ os.makedirs(instrumental_output_dir, exist_ok=True)
28
+ os.makedirs(vocals_output_dir, exist_ok=True)
29
+ self.audio_info = None
30
+ self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
31
+ self.default_model = self.available_models[0]
32
+ self.current_model_size = self.default_model
33
+ self.model_config = {
34
+ "segment": 256,
35
+ "split": True
36
+ }
37
+
38
+ def update_model(self,
39
+ model_name: str = "UVR-MDX-NET-Inst_1",
40
+ device: Optional[str] = None,
41
+ segment_size: int = 256):
42
+ """
43
+ Update model with the given model name
44
+
45
+ Args:
46
+ model_name (str): Model name.
47
+ device (str): Device to use for the model.
48
+ segment_size (int): Segment size for the prediction.
49
+ """
50
+ if device is None:
51
+ device = self.device
52
+
53
+ self.device = device
54
+ self.model_config = {
55
+ "segment": segment_size,
56
+ "split": True
57
+ }
58
+ self.model = MDX(name=model_name,
59
+ other_metadata=self.model_config,
60
+ device=self.device,
61
+ logger=None,
62
+ model_dir=self.model_dir)
63
+
64
+ def separate(self,
65
+ audio: Union[str, np.ndarray],
66
+ model_name: str,
67
+ device: Optional[str] = None,
68
+ segment_size: int = 256,
69
+ save_file: bool = False,
70
+ progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
71
+ """
72
+ Separate the background music from the audio.
73
+
74
+ Args:
75
+ audio (Union[str, np.ndarray]): Audio path or numpy array.
76
+ model_name (str): Model name.
77
+ device (str): Device to use for the model.
78
+ segment_size (int): Segment size for the prediction.
79
+ save_file (bool): Whether to save the separated audio to output path or not.
80
+ progress (gr.Progress): Gradio progress indicator.
81
+
82
+ Returns:
83
+ A Tuple of
84
+ np.ndarray: Instrumental numpy arrays.
85
+ np.ndarray: Vocals numpy arrays.
86
+ file_paths: List of file paths where the separated audio is saved. Return empty when save_file is False.
87
+ """
88
+ if isinstance(audio, str):
89
+ output_filename, ext = os.path.basename(audio), ".wav"
90
+ output_filename, orig_ext = os.path.splitext(output_filename)
91
+
92
+ if is_video(audio):
93
+ audio = load_audio(audio)
94
+ sample_rate = 16000
95
+ else:
96
+ self.audio_info = torchaudio.info(audio)
97
+ sample_rate = self.audio_info.sample_rate
98
+ else:
99
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
100
+ output_filename, ext = f"UVR-{timestamp}", ".wav"
101
+ sample_rate = 16000
102
+
103
+ model_config = {
104
+ "segment": segment_size,
105
+ "split": True
106
+ }
107
+
108
+ if (self.model is None or
109
+ self.current_model_size != model_name or
110
+ self.model_config != model_config or
111
+ self.model.sample_rate != sample_rate or
112
+ self.device != device):
113
+ progress(0, desc="Initializing UVR Model..")
114
+ self.update_model(
115
+ model_name=model_name,
116
+ device=device,
117
+ segment_size=segment_size
118
+ )
119
+ self.model.sample_rate = sample_rate
120
+
121
+ progress(0, desc="Separating background music from the audio..")
122
+ result = self.model(audio)
123
+ instrumental, vocals = result["instrumental"].T, result["vocals"].T
124
+
125
+ file_paths = []
126
+ if save_file:
127
+ instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
128
+ vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
129
+ sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
130
+ sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
131
+ file_paths += [instrumental_output_path, vocals_output_path]
132
+
133
+ return instrumental, vocals, file_paths
134
+
135
+ def separate_files(self,
136
+ files: List,
137
+ model_name: str,
138
+ device: Optional[str] = None,
139
+ segment_size: int = 256,
140
+ save_file: bool = True,
141
+ progress: gr.Progress = gr.Progress()) -> List[str]:
142
+ """Separate the background music from the given audio files. Returns only the last file's instrumental and vocals paths
143
+ to display in gr.Audio()"""
144
+ self.cache_parameters(model_size=model_name, segment_size=segment_size)
145
+
146
+ for file_path in files:
147
+ instrumental, vocals, file_paths = self.separate(
148
+ audio=file_path,
149
+ model_name=model_name,
150
+ device=device,
151
+ segment_size=segment_size,
152
+ save_file=save_file,
153
+ progress=progress
154
+ )
155
+ return file_paths
156
+
157
+ @staticmethod
158
+ def get_device():
159
+ """Get device for the model"""
160
+ return "cuda" if torch.cuda.is_available() else "cpu"
161
+
162
+ def offload(self):
163
+ """Offload the model and free up the memory"""
164
+ if self.model is not None:
165
+ del self.model
166
+ self.model = None
167
+ if self.device == "cuda":
168
+ torch.cuda.empty_cache()
169
+ gc.collect()
170
+ self.audio_info = None
171
+
172
+ @staticmethod
173
+ def cache_parameters(model_size: str,
174
+ segment_size: int):
175
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
176
+ cached_uvr_params = cached_params["bgm_separation"]
177
+ uvr_params_to_cache = {
178
+ "model_size": model_size,
179
+ "segment_size": segment_size
180
+ }
181
+ cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
182
+ cached_params["bgm_separation"] = cached_uvr_params
183
+ save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
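A hypothetical usage sketch; the UVR model weights are fetched on first use and the audio path is an example.

```python
from modules.utils.paths import UVR_MODELS_DIR, UVR_OUTPUT_DIR
from modules.uvr.music_separator import MusicSeparator

separator = MusicSeparator(model_dir=UVR_MODELS_DIR, output_dir=UVR_OUTPUT_DIR)
instrumental, vocals, paths = separator.separate(
    audio="demo/audio.wav",
    model_name="UVR-MDX-NET-Inst_HQ_4",
    device=separator.device,
    segment_size=256,
    save_file=True,
)
print(paths)   # [...-instrumental.wav, ...-vocals.wav] under outputs/UVR
```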
modules/vad/__init__.py ADDED
File without changes
modules/vad/silero_vad.py ADDED
@@ -0,0 +1,264 @@
1
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
2
+
3
+ from faster_whisper.vad import VadOptions, get_vad_model
4
+ import numpy as np
5
+ from typing import BinaryIO, Union, List, Optional, Tuple
6
+ import warnings
7
+ import faster_whisper
8
+ from faster_whisper.transcribe import SpeechTimestampsMap, Segment
9
+ import gradio as gr
10
+
11
+
12
+ class SileroVAD:
13
+ def __init__(self):
14
+ self.sampling_rate = 16000
15
+ self.window_size_samples = 512
16
+ self.model = None
17
+
18
+ def run(self,
19
+ audio: Union[str, BinaryIO, np.ndarray],
20
+ vad_parameters: VadOptions,
21
+ progress: gr.Progress = gr.Progress()
22
+ ) -> Tuple[np.ndarray, List[dict]]:
23
+ """
24
+ Run VAD
25
+
26
+ Parameters
27
+ ----------
28
+ audio: Union[str, BinaryIO, np.ndarray]
29
+ Audio path or file binary or Audio numpy array
30
+ vad_parameters:
31
+ Options for VAD processing.
32
+ progress: gr.Progress
33
+ Indicator to show progress directly in gradio.
34
+
35
+ Returns
36
+ ----------
37
+ np.ndarray
38
+ Pre-processed audio with VAD
39
+ List[dict]
40
+ Chunks of speeches to be used to restore the timestamps later
41
+ """
42
+
43
+ sampling_rate = self.sampling_rate
44
+
45
+ if not isinstance(audio, np.ndarray):
46
+ audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
47
+
48
+ duration = audio.shape[0] / sampling_rate
49
+ duration_after_vad = duration
50
+
51
+ if vad_parameters is None:
52
+ vad_parameters = VadOptions()
53
+ elif isinstance(vad_parameters, dict):
54
+ vad_parameters = VadOptions(**vad_parameters)
55
+ speech_chunks = self.get_speech_timestamps(
56
+ audio=audio,
57
+ vad_options=vad_parameters,
58
+ progress=progress
59
+ )
60
+ audio = self.collect_chunks(audio, speech_chunks)
61
+ duration_after_vad = audio.shape[0] / sampling_rate
62
+
63
+ return audio, speech_chunks
64
+
65
+ def get_speech_timestamps(
66
+ self,
67
+ audio: np.ndarray,
68
+ vad_options: Optional[VadOptions] = None,
69
+ progress: gr.Progress = gr.Progress(),
70
+ **kwargs,
71
+ ) -> List[dict]:
72
+ """This method is used for splitting long audios into speech chunks using silero VAD.
73
+
74
+ Args:
75
+ audio: One dimensional float array.
76
+ vad_options: Options for VAD processing.
77
+ kwargs: VAD options passed as keyword arguments for backward compatibility.
78
+ progress: Gradio progress to indicate progress.
79
+
80
+ Returns:
81
+ List of dicts containing begin and end samples of each speech chunk.
82
+ """
83
+
84
+ if self.model is None:
85
+ self.update_model()
86
+
87
+ if vad_options is None:
88
+ vad_options = VadOptions(**kwargs)
89
+
90
+ threshold = vad_options.threshold
91
+ min_speech_duration_ms = vad_options.min_speech_duration_ms
92
+ max_speech_duration_s = vad_options.max_speech_duration_s
93
+ min_silence_duration_ms = vad_options.min_silence_duration_ms
94
+ window_size_samples = self.window_size_samples
95
+ speech_pad_ms = vad_options.speech_pad_ms
96
+ sampling_rate = 16000
97
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
98
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
99
+ max_speech_samples = (
100
+ sampling_rate * max_speech_duration_s
101
+ - window_size_samples
102
+ - 2 * speech_pad_samples
103
+ )
104
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
105
+ min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
106
+
107
+ audio_length_samples = len(audio)
108
+
109
+ state, context = self.model.get_initial_states(batch_size=1)
110
+
111
+ speech_probs = []
112
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
113
+ progress(current_start_sample / audio_length_samples, desc="Detecting speech parts with VAD...")
114
+
115
+ chunk = audio[current_start_sample: current_start_sample + window_size_samples]
116
+ if len(chunk) < window_size_samples:
117
+ chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
118
+ speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
119
+ speech_probs.append(speech_prob)
120
+
121
+ triggered = False
122
+ speeches = []
123
+ current_speech = {}
124
+ neg_threshold = threshold - 0.15
125
+
126
+ # to save potential segment end (and tolerate some silence)
127
+ temp_end = 0
128
+ # to save potential segment limits in case of maximum segment size reached
129
+ prev_end = next_start = 0
130
+
131
+ for i, speech_prob in enumerate(speech_probs):
132
+ if (speech_prob >= threshold) and temp_end:
133
+ temp_end = 0
134
+ if next_start < prev_end:
135
+ next_start = window_size_samples * i
136
+
137
+ if (speech_prob >= threshold) and not triggered:
138
+ triggered = True
139
+ current_speech["start"] = window_size_samples * i
140
+ continue
141
+
142
+ if (
143
+ triggered
144
+ and (window_size_samples * i) - current_speech["start"] > max_speech_samples
145
+ ):
146
+ if prev_end:
147
+ current_speech["end"] = prev_end
148
+ speeches.append(current_speech)
149
+ current_speech = {}
150
+ # previously reached silence (< neg_thres) and is still not speech (< thres)
151
+ if next_start < prev_end:
152
+ triggered = False
153
+ else:
154
+ current_speech["start"] = next_start
155
+ prev_end = next_start = temp_end = 0
156
+ else:
157
+ current_speech["end"] = window_size_samples * i
158
+ speeches.append(current_speech)
159
+ current_speech = {}
160
+ prev_end = next_start = temp_end = 0
161
+ triggered = False
162
+ continue
163
+
164
+ if (speech_prob < neg_threshold) and triggered:
165
+ if not temp_end:
166
+ temp_end = window_size_samples * i
167
+ # condition to avoid cutting in very short silence
168
+ if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
169
+ prev_end = temp_end
170
+ if (window_size_samples * i) - temp_end < min_silence_samples:
171
+ continue
172
+ else:
173
+ current_speech["end"] = temp_end
174
+ if (
175
+ current_speech["end"] - current_speech["start"]
176
+ ) > min_speech_samples:
177
+ speeches.append(current_speech)
178
+ current_speech = {}
179
+ prev_end = next_start = temp_end = 0
180
+ triggered = False
181
+ continue
182
+
183
+ if (
184
+ current_speech
185
+ and (audio_length_samples - current_speech["start"]) > min_speech_samples
186
+ ):
187
+ current_speech["end"] = audio_length_samples
188
+ speeches.append(current_speech)
189
+
190
+ for i, speech in enumerate(speeches):
191
+ if i == 0:
192
+ speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
193
+ if i != len(speeches) - 1:
194
+ silence_duration = speeches[i + 1]["start"] - speech["end"]
195
+ if silence_duration < 2 * speech_pad_samples:
196
+ speech["end"] += int(silence_duration // 2)
197
+ speeches[i + 1]["start"] = int(
198
+ max(0, speeches[i + 1]["start"] - silence_duration // 2)
199
+ )
200
+ else:
201
+ speech["end"] = int(
202
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
203
+ )
204
+ speeches[i + 1]["start"] = int(
205
+ max(0, speeches[i + 1]["start"] - speech_pad_samples)
206
+ )
207
+ else:
208
+ speech["end"] = int(
209
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
210
+ )
211
+
212
+ return speeches
213
+
214
+ def update_model(self):
215
+ self.model = get_vad_model()
216
+
217
+ @staticmethod
218
+ def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
219
+ """Collects and concatenates audio chunks."""
220
+ if not chunks:
221
+ return np.array([], dtype=np.float32)
222
+
223
+ return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
224
+
225
+ @staticmethod
226
+ def format_timestamp(
227
+ seconds: float,
228
+ always_include_hours: bool = False,
229
+ decimal_marker: str = ".",
230
+ ) -> str:
231
+ assert seconds >= 0, "non-negative timestamp expected"
232
+ milliseconds = round(seconds * 1000.0)
233
+
234
+ hours = milliseconds // 3_600_000
235
+ milliseconds -= hours * 3_600_000
236
+
237
+ minutes = milliseconds // 60_000
238
+ milliseconds -= minutes * 60_000
239
+
240
+ seconds = milliseconds // 1_000
241
+ milliseconds -= seconds * 1_000
242
+
243
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
244
+ return (
245
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
246
+ )
247
+
248
+ def restore_speech_timestamps(
249
+ self,
250
+ segments: List[dict],
251
+ speech_chunks: List[dict],
252
+ sampling_rate: Optional[int] = None,
253
+ ) -> List[dict]:
254
+ if sampling_rate is None:
255
+ sampling_rate = self.sampling_rate
256
+
257
+ ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
258
+
259
+ for segment in segments:
260
+ segment["start"] = ts_map.get_original_time(segment["start"])
261
+ segment["end"] = ts_map.get_original_time(segment["end"])
262
+
263
+ return segments
264
+
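As a usage note, here is a minimal sketch of the intended flow around the class above (assuming faster-whisper is installed; the segment dict is a placeholder): trim silence first, transcribe the trimmed audio with any backend, then map the timestamps back onto the original timeline.

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD

vad = SileroVAD()
trimmed_audio, speech_chunks = vad.run(
    audio="demo/audio.wav",                             # decoded to 16 kHz mono internally
    vad_parameters=VadOptions(threshold=0.5, min_silence_duration_ms=2000),
)

# Suppose some backend produced segments on the trimmed timeline (placeholder result):
segments = [{"start": 0.0, "end": 2.5, "text": "hello"}]

# Restore the original, pre-VAD timestamps before writing subtitles
segments = vad.restore_speech_timestamps(segments=segments, speech_chunks=speech_chunks)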
modules/whisper/__init__.py ADDED
File without changes
modules/whisper/faster_whisper_inference.py ADDED
@@ -0,0 +1,192 @@
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ import torch
5
+ from typing import BinaryIO, Union, Tuple, List
6
+ import faster_whisper
7
+ from faster_whisper.vad import VadOptions
8
+ import ast
9
+ import ctranslate2
10
+ import whisper
11
+ import gradio as gr
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.whisper_parameter import *
16
+ from modules.whisper.whisper_base import WhisperBase
17
+
18
+
19
+ class FasterWhisperInference(WhisperBase):
20
+ def __init__(self,
21
+ model_dir: str = FASTER_WHISPER_MODELS_DIR,
22
+ diarization_model_dir: str = DIARIZATION_MODELS_DIR,
23
+ uvr_model_dir: str = UVR_MODELS_DIR,
24
+ output_dir: str = OUTPUT_DIR,
25
+ ):
26
+ super().__init__(
27
+ model_dir=model_dir,
28
+ diarization_model_dir=diarization_model_dir,
29
+ uvr_model_dir=uvr_model_dir,
30
+ output_dir=output_dir
31
+ )
32
+ self.model_dir = model_dir
33
+ os.makedirs(self.model_dir, exist_ok=True)
34
+
35
+ self.model_paths = self.get_model_paths()
36
+ self.device = self.get_device()
37
+ self.available_models = self.model_paths.keys()
38
+ self.available_compute_types = ctranslate2.get_supported_compute_types(
39
+ "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
40
+
41
+ def transcribe(self,
42
+ audio: Union[str, BinaryIO, np.ndarray],
43
+ progress: gr.Progress = gr.Progress(),
44
+ *whisper_params,
45
+ ) -> Tuple[List[dict], float]:
46
+ """
47
+ transcribe method for faster-whisper.
48
+
49
+ Parameters
50
+ ----------
51
+ audio: Union[str, BinaryIO, np.ndarray]
52
+ Audio path or file binary or Audio numpy array
53
+ progress: gr.Progress
54
+ Indicator to show progress directly in gradio.
55
+ *whisper_params: tuple
56
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
57
+
58
+ Returns
59
+ ----------
60
+ segments_result: List[dict]
61
+ list of dicts that includes start, end timestamps and transcribed text
62
+ elapsed_time: float
63
+ elapsed time for transcription
64
+ """
65
+ start_time = time.time()
66
+
67
+ params = WhisperParameters.as_value(*whisper_params)
68
+
69
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
70
+ self.update_model(params.model_size, params.compute_type, progress)
71
+
72
+ # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
73
+ if not params.initial_prompt:
74
+ params.initial_prompt = None
75
+ if not params.prefix:
76
+ params.prefix = None
77
+ if not params.hotwords:
78
+ params.hotwords = None
79
+
80
+ params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
81
+
82
+ segments, info = self.model.transcribe(
83
+ audio=audio,
84
+ language=params.lang,
85
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
86
+ beam_size=params.beam_size,
87
+ log_prob_threshold=params.log_prob_threshold,
88
+ no_speech_threshold=params.no_speech_threshold,
89
+ best_of=params.best_of,
90
+ patience=params.patience,
91
+ temperature=params.temperature,
92
+ initial_prompt=params.initial_prompt,
93
+ compression_ratio_threshold=params.compression_ratio_threshold,
94
+ length_penalty=params.length_penalty,
95
+ repetition_penalty=params.repetition_penalty,
96
+ no_repeat_ngram_size=params.no_repeat_ngram_size,
97
+ prefix=params.prefix,
98
+ suppress_blank=params.suppress_blank,
99
+ suppress_tokens=params.suppress_tokens,
100
+ max_initial_timestamp=params.max_initial_timestamp,
101
+ word_timestamps=params.word_timestamps,
102
+ prepend_punctuations=params.prepend_punctuations,
103
+ append_punctuations=params.append_punctuations,
104
+ max_new_tokens=params.max_new_tokens,
105
+ chunk_length=params.chunk_length,
106
+ hallucination_silence_threshold=params.hallucination_silence_threshold,
107
+ hotwords=params.hotwords,
108
+ language_detection_threshold=params.language_detection_threshold,
109
+ language_detection_segments=params.language_detection_segments,
110
+ prompt_reset_on_temperature=params.prompt_reset_on_temperature,
111
+ )
112
+ progress(0, desc="Loading audio..")
113
+
114
+ segments_result = []
115
+ for segment in segments:
116
+ progress(segment.start / info.duration, desc="Transcribing..")
117
+ segments_result.append({
118
+ "start": segment.start,
119
+ "end": segment.end,
120
+ "text": segment.text
121
+ })
122
+
123
+ elapsed_time = time.time() - start_time
124
+ return segments_result, elapsed_time
125
+
126
+ def update_model(self,
127
+ model_size: str,
128
+ compute_type: str,
129
+ progress: gr.Progress = gr.Progress()
130
+ ):
131
+ """
132
+ Update current model setting
133
+
134
+ Parameters
135
+ ----------
136
+ model_size: str
137
+ Size of whisper model
138
+ compute_type: str
139
+ Compute type for transcription.
140
+ see more info : https://opennmt.net/CTranslate2/quantization.html
141
+ progress: gr.Progress
142
+ Indicator to show progress directly in gradio.
143
+ """
144
+ progress(0, desc="Initializing Model..")
145
+ self.current_model_size = self.model_paths[model_size]
146
+ self.current_compute_type = compute_type
147
+ self.model = faster_whisper.WhisperModel(
148
+ device=self.device,
149
+ model_size_or_path=self.current_model_size,
150
+ download_root=self.model_dir,
151
+ compute_type=self.current_compute_type
152
+ )
153
+
154
+ def get_model_paths(self):
155
+ """
156
+ Get available models from models path including fine-tuned model.
157
+
158
+ Returns
159
+ ----------
160
+ Name list of models
161
+ """
162
+ model_paths = {model: model for model in faster_whisper.available_models()}
163
+ faster_whisper_prefix = "models--Systran--faster-whisper-"
164
+
165
+ existing_models = os.listdir(self.model_dir)
166
+ wrong_dirs = [".locks"]
167
+ existing_models = list(set(existing_models) - set(wrong_dirs))
168
+
169
+ for model_name in existing_models:
170
+ if faster_whisper_prefix in model_name:
171
+ model_name = model_name[len(faster_whisper_prefix):]
172
+
173
+ if model_name not in whisper.available_models():
174
+ model_paths[model_name] = os.path.join(self.model_dir, model_name)
175
+ return model_paths
176
+
177
+ @staticmethod
178
+ def get_device():
179
+ if torch.cuda.is_available():
180
+ return "cuda"
181
+ else:
182
+ return "auto"
183
+
184
+ @staticmethod
185
+ def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
186
+ try:
187
+ suppress_tokens = ast.literal_eval(suppress_tokens_str)
188
+ if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
189
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
190
+ return suppress_tokens
191
+ except Exception as e:
192
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]") from e
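A quick sketch of the suppress-token handling above: the UI hands the value over as a string, and format_suppress_tokens_str parses it back into the List[int] that faster-whisper expects, raising ValueError for anything else.

from modules.whisper.faster_whisper_inference import FasterWhisperInference

tokens = FasterWhisperInference.format_suppress_tokens_str("[-1]")
print(tokens)                                   # [-1]

try:
    FasterWhisperInference.format_suppress_tokens_str("not a list")
except ValueError as err:
    print(err)                                  # Invalid Suppress Tokens. The value must be type of List[int]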
modules/whisper/insanely_fast_whisper_inference.py ADDED
@@ -0,0 +1,195 @@
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import torch
6
+ from transformers import pipeline
7
+ from transformers.utils import is_flash_attn_2_available
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import whisper
11
+ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.whisper_parameter import *
16
+ from modules.whisper.whisper_base import WhisperBase
17
+
18
+
19
+ class InsanelyFastWhisperInference(WhisperBase):
20
+ def __init__(self,
21
+ model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
22
+ diarization_model_dir: str = DIARIZATION_MODELS_DIR,
23
+ uvr_model_dir: str = UVR_MODELS_DIR,
24
+ output_dir: str = OUTPUT_DIR,
25
+ ):
26
+ super().__init__(
27
+ model_dir=model_dir,
28
+ output_dir=output_dir,
29
+ diarization_model_dir=diarization_model_dir,
30
+ uvr_model_dir=uvr_model_dir
31
+ )
32
+ self.model_dir = model_dir
33
+ os.makedirs(self.model_dir, exist_ok=True)
34
+
35
+ openai_models = whisper.available_models()
36
+ distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
37
+ self.available_models = openai_models + distil_models
38
+ self.available_compute_types = ["float16"]
39
+
40
+ def transcribe(self,
41
+ audio: Union[str, np.ndarray, torch.Tensor],
42
+ progress: gr.Progress = gr.Progress(),
43
+ *whisper_params,
44
+ ) -> Tuple[List[dict], float]:
45
+ """
46
+ transcribe method for insanely-fast-whisper.
47
+
48
+ Parameters
49
+ ----------
50
+ audio: Union[str, BinaryIO, np.ndarray]
51
+ Audio path or file binary or Audio numpy array
52
+ progress: gr.Progress
53
+ Indicator to show progress directly in gradio.
54
+ *whisper_params: tuple
55
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
56
+
57
+ Returns
58
+ ----------
59
+ segments_result: List[dict]
60
+ list of dicts that includes start, end timestamps and transcribed text
61
+ elapsed_time: float
62
+ elapsed time for transcription
63
+ """
64
+ start_time = time.time()
65
+ params = WhisperParameters.as_value(*whisper_params)
66
+
67
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
68
+ self.update_model(params.model_size, params.compute_type, progress)
69
+
70
+ progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
71
+ with Progress(
72
+ TextColumn("[progress.description]{task.description}"),
73
+ BarColumn(style="yellow1", pulse_style="white"),
74
+ TimeElapsedColumn(),
75
+ ) as progress:
76
+ progress.add_task("[yellow]Transcribing...", total=None)
77
+
78
+ kwargs = {
79
+ "no_speech_threshold": params.no_speech_threshold,
80
+ "temperature": params.temperature,
81
+ "compression_ratio_threshold": params.compression_ratio_threshold,
82
+ "logprob_threshold": params.log_prob_threshold,
83
+ }
84
+
85
+ if self.current_model_size.endswith(".en"):
86
+ pass
87
+ else:
88
+ kwargs["language"] = params.lang
89
+ kwargs["task"] = "translate" if params.is_translate else "transcribe"
90
+
91
+ segments = self.model(
92
+ inputs=audio,
93
+ return_timestamps=True,
94
+ chunk_length_s=params.chunk_length,
95
+ batch_size=params.batch_size,
96
+ generate_kwargs=kwargs
97
+ )
98
+
99
+ segments_result = self.format_result(
100
+ transcribed_result=segments,
101
+ )
102
+ elapsed_time = time.time() - start_time
103
+ return segments_result, elapsed_time
104
+
105
+ def update_model(self,
106
+ model_size: str,
107
+ compute_type: str,
108
+ progress: gr.Progress = gr.Progress(),
109
+ ):
110
+ """
111
+ Update current model setting
112
+
113
+ Parameters
114
+ ----------
115
+ model_size: str
116
+ Size of whisper model
117
+ compute_type: str
118
+ Compute type for transcription.
119
+ see more info : https://opennmt.net/CTranslate2/quantization.html
120
+ progress: gr.Progress
121
+ Indicator to show progress directly in gradio.
122
+ """
123
+ progress(0, desc="Initializing Model..")
124
+ model_path = os.path.join(self.model_dir, model_size)
125
+ if not os.path.isdir(model_path) or not os.listdir(model_path):
126
+ self.download_model(
127
+ model_size=model_size,
128
+ download_root=model_path,
129
+ progress=progress
130
+ )
131
+
132
+ self.current_compute_type = compute_type
133
+ self.current_model_size = model_size
134
+ self.model = pipeline(
135
+ "automatic-speech-recognition",
136
+ model=os.path.join(self.model_dir, model_size),
137
+ torch_dtype=self.current_compute_type,
138
+ device=self.device,
139
+ model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
140
+ )
141
+
142
+ @staticmethod
143
+ def format_result(
144
+ transcribed_result: dict
145
+ ) -> List[dict]:
146
+ """
147
+ Format the transcription result of insanely_fast_whisper as the same with other implementation.
148
+
149
+ Parameters
150
+ ----------
151
+ transcribed_result: dict
152
+ Transcription result of the insanely_fast_whisper
153
+
154
+ Returns
155
+ ----------
156
+ result: List[dict]
157
+ Formatted result as the same with other implementation
158
+ """
159
+ result = transcribed_result["chunks"]
160
+ for item in result:
161
+ start, end = item["timestamp"][0], item["timestamp"][1]
162
+ if end is None:
163
+ end = start
164
+ item["start"] = start
165
+ item["end"] = end
166
+ return result
167
+
168
+ @staticmethod
169
+ def download_model(
170
+ model_size: str,
171
+ download_root: str,
172
+ progress: gr.Progress
173
+ ):
174
+ progress(0, 'Initializing model..')
175
+ print(f'Downloading {model_size} to "{download_root}"....')
176
+
177
+ os.makedirs(download_root, exist_ok=True)
178
+ download_list = [
179
+ "model.safetensors",
180
+ "config.json",
181
+ "generation_config.json",
182
+ "preprocessor_config.json",
183
+ "tokenizer.json",
184
+ "tokenizer_config.json",
185
+ "added_tokens.json",
186
+ "special_tokens_map.json",
187
+ "vocab.json",
188
+ ]
189
+
190
+ if model_size.startswith("distil"):
191
+ repo_id = f"distil-whisper/{model_size}"
192
+ else:
193
+ repo_id = f"openai/whisper-{model_size}"
194
+ for item in download_list:
195
+ hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
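For reference, a small sketch of what format_result above does with the Hugging Face pipeline output (the chunk contents are made-up placeholders): it copies each "timestamp" tuple into the start / end keys used elsewhere in the WebUI, falling back to the start time when the final chunk has no end timestamp.

from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference

pipeline_output = {
    "chunks": [
        {"text": " Hello there.", "timestamp": (0.0, 1.8)},
        {"text": " Bye.", "timestamp": (2.0, None)},     # open-ended final chunk
    ]
}
segments = InsanelyFastWhisperInference.format_result(pipeline_output)
print(segments[1]["start"], segments[1]["end"])          # 2.0 2.0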
modules/whisper/whisper_Inference.py ADDED
@@ -0,0 +1,104 @@
1
+ import whisper
2
+ import gradio as gr
3
+ import time
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import numpy as np
6
+ import torch
7
+ import os
8
+ from argparse import Namespace
9
+
10
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
+ from modules.whisper.whisper_base import WhisperBase
12
+ from modules.whisper.whisper_parameter import *
13
+
14
+
15
+ class WhisperInference(WhisperBase):
16
+ def __init__(self,
17
+ model_dir: str = WHISPER_MODELS_DIR,
18
+ diarization_model_dir: str = DIARIZATION_MODELS_DIR,
19
+ uvr_model_dir: str = UVR_MODELS_DIR,
20
+ output_dir: str = OUTPUT_DIR,
21
+ ):
22
+ super().__init__(
23
+ model_dir=model_dir,
24
+ output_dir=output_dir,
25
+ diarization_model_dir=diarization_model_dir,
26
+ uvr_model_dir=uvr_model_dir
27
+ )
28
+
29
+ def transcribe(self,
30
+ audio: Union[str, np.ndarray, torch.Tensor],
31
+ progress: gr.Progress = gr.Progress(),
32
+ *whisper_params,
33
+ ) -> Tuple[List[dict], float]:
34
+ """
35
+ transcribe method for the original OpenAI Whisper implementation.
36
+
37
+ Parameters
38
+ ----------
39
+ audio: Union[str, BinaryIO, np.ndarray]
40
+ Audio path or file binary or Audio numpy array
41
+ progress: gr.Progress
42
+ Indicator to show progress directly in gradio.
43
+ *whisper_params: tuple
44
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
45
+
46
+ Returns
47
+ ----------
48
+ segments_result: List[dict]
49
+ list of dicts that includes start, end timestamps and transcribed text
50
+ elapsed_time: float
51
+ elapsed time for transcription
52
+ """
53
+ start_time = time.time()
54
+ params = WhisperParameters.as_value(*whisper_params)
55
+
56
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
57
+ self.update_model(params.model_size, params.compute_type, progress)
58
+
59
+ def progress_callback(progress_value):
60
+ progress(progress_value, desc="Transcribing..")
61
+
62
+ segments_result = self.model.transcribe(audio=audio,
63
+ language=params.lang,
64
+ verbose=False,
65
+ beam_size=params.beam_size,
66
+ logprob_threshold=params.log_prob_threshold,
67
+ no_speech_threshold=params.no_speech_threshold,
68
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
69
+ fp16=True if params.compute_type == "float16" else False,
70
+ best_of=params.best_of,
71
+ patience=params.patience,
72
+ temperature=params.temperature,
73
+ compression_ratio_threshold=params.compression_ratio_threshold,
74
+ progress_callback=progress_callback,)["segments"]
75
+ elapsed_time = time.time() - start_time
76
+
77
+ return segments_result, elapsed_time
78
+
79
+ def update_model(self,
80
+ model_size: str,
81
+ compute_type: str,
82
+ progress: gr.Progress = gr.Progress(),
83
+ ):
84
+ """
85
+ Update current model setting
86
+
87
+ Parameters
88
+ ----------
89
+ model_size: str
90
+ Size of whisper model
91
+ compute_type: str
92
+ Compute type for transcription.
93
+ see more info : https://opennmt.net/CTranslate2/quantization.html
94
+ progress: gr.Progress
95
+ Indicator to show progress directly in gradio.
96
+ """
97
+ progress(0, desc="Initializing Model..")
98
+ self.current_compute_type = compute_type
99
+ self.current_model_size = model_size
100
+ self.model = whisper.load_model(
101
+ name=model_size,
102
+ device=self.device,
103
+ download_root=self.model_dir
104
+ )
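As a usage note, a minimal sketch of loading this backend by hand; the model size and compute type are illustrative. transcribe() itself expects the full positional tuple produced by the WhisperParameters data class, so in practice it is driven through WhisperBase.run() rather than called directly.

from modules.whisper.whisper_Inference import WhisperInference

inferencer = WhisperInference()                        # default model/output directories
inferencer.update_model(model_size="base", compute_type="float32")
print(inferencer.current_model_size, inferencer.current_compute_type)   # base float32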
modules/whisper/whisper_base.py ADDED
@@ -0,0 +1,542 @@
1
+ import os
2
+ import torch
3
+ import whisper
4
+ import gradio as gr
5
+ import torchaudio
6
+ from abc import ABC, abstractmethod
7
+ from typing import BinaryIO, Union, Tuple, List
8
+ import numpy as np
9
+ from datetime import datetime
10
+ from faster_whisper.vad import VadOptions
11
+ from dataclasses import astuple
12
+
13
+ from modules.uvr.music_separator import MusicSeparator
14
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
15
+ UVR_MODELS_DIR)
16
+ from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
17
+ from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
+ from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
19
+ from modules.whisper.whisper_parameter import *
20
+ from modules.diarize.diarizer import Diarizer
21
+ from modules.vad.silero_vad import SileroVAD
22
+
23
+
24
+ class WhisperBase(ABC):
25
+ def __init__(self,
26
+ model_dir: str = WHISPER_MODELS_DIR,
27
+ diarization_model_dir: str = DIARIZATION_MODELS_DIR,
28
+ uvr_model_dir: str = UVR_MODELS_DIR,
29
+ output_dir: str = OUTPUT_DIR,
30
+ ):
31
+ self.model_dir = model_dir
32
+ self.output_dir = output_dir
33
+ os.makedirs(self.output_dir, exist_ok=True)
34
+ os.makedirs(self.model_dir, exist_ok=True)
35
+ self.diarizer = Diarizer(
36
+ model_dir=diarization_model_dir
37
+ )
38
+ self.vad = SileroVAD()
39
+ self.music_separator = MusicSeparator(
40
+ model_dir=uvr_model_dir,
41
+ output_dir=os.path.join(output_dir, "UVR")
42
+ )
43
+
44
+ self.model = None
45
+ self.current_model_size = None
46
+ self.available_models = whisper.available_models()
47
+ self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
48
+ self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
49
+ self.device = self.get_device()
50
+ self.available_compute_types = ["float16", "float32"]
51
+ self.current_compute_type = "float16" if self.device == "cuda" else "float32"
52
+
53
+ @abstractmethod
54
+ def transcribe(self,
55
+ audio: Union[str, BinaryIO, np.ndarray],
56
+ progress: gr.Progress = gr.Progress(),
57
+ *whisper_params,
58
+ ):
59
+ """Inference whisper model to transcribe"""
60
+ pass
61
+
62
+ @abstractmethod
63
+ def update_model(self,
64
+ model_size: str,
65
+ compute_type: str,
66
+ progress: gr.Progress = gr.Progress()
67
+ ):
68
+ """Initialize whisper model"""
69
+ pass
70
+
71
+ def run(self,
72
+ audio: Union[str, BinaryIO, np.ndarray],
73
+ progress: gr.Progress = gr.Progress(),
74
+ add_timestamp: bool = True,
75
+ *whisper_params,
76
+ ) -> Tuple[List[dict], float]:
77
+ """
78
+ Run transcription with conditional pre-processing and post-processing.
79
+ The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
80
+ The diarization will be performed in post-processing, if enabled.
81
+
82
+ Parameters
83
+ ----------
84
+ audio: Union[str, BinaryIO, np.ndarray]
85
+ Audio input. This can be file path or binary type.
86
+ progress: gr.Progress
87
+ Indicator to show progress directly in gradio.
88
+ add_timestamp: bool
89
+ Whether to add a timestamp at the end of the filename.
90
+ *whisper_params: tuple
91
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
92
+
93
+ Returns
94
+ ----------
95
+ segments_result: List[dict]
96
+ list of dicts that includes start, end timestamps and transcribed text
97
+ elapsed_time: float
98
+ elapsed time for running
99
+ """
100
+ params = WhisperParameters.as_value(*whisper_params)
101
+
102
+ self.cache_parameters(
103
+ whisper_params=params,
104
+ add_timestamp=add_timestamp
105
+ )
106
+
107
+ if params.lang is None:
108
+ pass
109
+ elif params.lang == "Automatic Detection":
110
+ params.lang = None
111
+ else:
112
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
113
+ params.lang = language_code_dict[params.lang]
114
+
115
+ if params.is_bgm_separate:
116
+ music, audio, _ = self.music_separator.separate(
117
+ audio=audio,
118
+ model_name=params.uvr_model_size,
119
+ device=params.uvr_device,
120
+ segment_size=params.uvr_segment_size,
121
+ save_file=params.uvr_save_file,
122
+ progress=progress
123
+ )
124
+
125
+ if audio.ndim >= 2:
126
+ audio = audio.mean(axis=1)
127
+ if self.music_separator.audio_info is None:
128
+ origin_sample_rate = 16000
129
+ else:
130
+ origin_sample_rate = self.music_separator.audio_info.sample_rate
131
+ audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
132
+
133
+ if params.uvr_enable_offload:
134
+ self.music_separator.offload()
135
+
136
+ if params.vad_filter:
137
+ # Explicit value set for float('inf') from gr.Number()
138
+ if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
139
+ params.max_speech_duration_s = float('inf')
140
+
141
+ vad_options = VadOptions(
142
+ threshold=params.threshold,
143
+ min_speech_duration_ms=params.min_speech_duration_ms,
144
+ max_speech_duration_s=params.max_speech_duration_s,
145
+ min_silence_duration_ms=params.min_silence_duration_ms,
146
+ speech_pad_ms=params.speech_pad_ms
147
+ )
148
+
149
+ audio, speech_chunks = self.vad.run(
150
+ audio=audio,
151
+ vad_parameters=vad_options,
152
+ progress=progress
153
+ )
154
+
155
+ result, elapsed_time = self.transcribe(
156
+ audio,
157
+ progress,
158
+ *astuple(params)
159
+ )
160
+
161
+ if params.vad_filter:
162
+ result = self.vad.restore_speech_timestamps(
163
+ segments=result,
164
+ speech_chunks=speech_chunks,
165
+ )
166
+
167
+ if params.is_diarize:
168
+ result, elapsed_time_diarization = self.diarizer.run(
169
+ audio=audio,
170
+ use_auth_token=params.hf_token,
171
+ transcribed_result=result,
172
+ )
173
+ elapsed_time += elapsed_time_diarization
174
+ return result, elapsed_time
175
+
176
+ def transcribe_file(self,
177
+ files: Optional[List] = None,
178
+ input_folder_path: Optional[str] = None,
179
+ file_format: str = "SRT",
180
+ add_timestamp: bool = True,
181
+ progress=gr.Progress(),
182
+ *whisper_params,
183
+ ) -> list:
184
+ """
185
+ Write subtitle file from Files
186
+
187
+ Parameters
188
+ ----------
189
+ files: list
190
+ List of files to transcribe from gr.Files()
191
+ input_folder_path: str
192
+ Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
193
+ this will be used instead.
194
+ file_format: str
195
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
196
+ add_timestamp: bool
197
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
198
+ progress: gr.Progress
199
+ Indicator to show progress directly in gradio.
200
+ *whisper_params: tuple
201
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
202
+
203
+ Returns
204
+ ----------
205
+ result_str:
206
+ Result of transcription to return to gr.Textbox()
207
+ result_file_path:
208
+ Output file path to return to gr.Files()
209
+ """
210
+ try:
211
+ if input_folder_path:
212
+ files = get_media_files(input_folder_path)
213
+ if isinstance(files, str):
214
+ files = [files]
215
+ if files and isinstance(files[0], gr.utils.NamedString):
216
+ files = [file.name for file in files]
217
+
218
+ files_info = {}
219
+ for file in files:
220
+
221
+ ## Detect language
222
+ #model = whisper.load_model("base")
223
+ params = WhisperParameters.as_value(*whisper_params)
224
+ model = whisper.load_model(params.model_size)
225
+ mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(whisper.load_audio(file))).to(model.device)
226
+ _, probs = model.detect_language(mel)
227
+ file_language = "not"
228
+ for key,value in whisper.tokenizer.LANGUAGES.items():
229
+ if key == str(max(probs, key=probs.get)):
230
+ file_language = value.capitalize()
231
+ break
232
+
233
+ transcribed_segments, time_for_task = self.run(
234
+ file,
235
+ progress,
236
+ add_timestamp,
237
+ *whisper_params,
238
+ )
239
+
240
+ file_name, file_ext = os.path.splitext(os.path.basename(file))
241
+ subtitle, file_path = self.generate_and_write_file(
242
+ file_name=file_name,
243
+ transcribed_segments=transcribed_segments,
244
+ add_timestamp=add_timestamp,
245
+ file_format=file_format,
246
+ output_dir=self.output_dir
247
+ )
248
+
249
+ files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path, "lang": file_language, "input_file_name": os.path.basename(file)}
250
+
251
+ total_result = ''
252
+ total_info = ''
253
+ total_time = 0
254
+ for file_name, info in files_info.items():
255
+ total_result += f'{info["subtitle"]}'
256
+ total_time += info["time_for_task"]
257
+ #total_info += f'{info["lang"]}'
258
+ total_info += f"Language {info['lang']} detected for file '{file_name}{file_ext}'"
259
+
260
+ #result_str = f"Processing of file '{file_name}{file_ext}' done in {self.format_time(total_time)}:\n\n{total_result}"
261
+ total_info += f"\nTranscription process done in {self.format_time(total_time)}"
262
+ result_str = total_result
263
+ result_file_path = [info['path'] for info in files_info.values()]
264
+
265
+ return [result_str, result_file_path, total_info]
266
+
267
+ except Exception as e:
268
+ print(f"Error transcribing file: {e}")
269
+ finally:
270
+ self.release_cuda_memory()
271
+
272
+ def transcribe_mic(self,
273
+ mic_audio: str,
274
+ file_format: str = "SRT",
275
+ add_timestamp: bool = True,
276
+ progress=gr.Progress(),
277
+ *whisper_params,
278
+ ) -> list:
279
+ """
280
+ Write subtitle file from microphone
281
+
282
+ Parameters
283
+ ----------
284
+ mic_audio: str
285
+ Audio file path from gr.Microphone()
286
+ file_format: str
287
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
288
+ add_timestamp: bool
289
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
290
+ progress: gr.Progress
291
+ Indicator to show progress directly in gradio.
292
+ *whisper_params: tuple
293
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
294
+
295
+ Returns
296
+ ----------
297
+ result_str:
298
+ Result of transcription to return to gr.Textbox()
299
+ result_file_path:
300
+ Output file path to return to gr.Files()
301
+ """
302
+ try:
303
+ progress(0, desc="Loading Audio..")
304
+ transcribed_segments, time_for_task = self.run(
305
+ mic_audio,
306
+ progress,
307
+ add_timestamp,
308
+ *whisper_params,
309
+ )
310
+ progress(1, desc="Completed!")
311
+
312
+ subtitle, result_file_path = self.generate_and_write_file(
313
+ file_name="Mic",
314
+ transcribed_segments=transcribed_segments,
315
+ add_timestamp=add_timestamp,
316
+ file_format=file_format,
317
+ output_dir=self.output_dir
318
+ )
319
+
320
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
321
+ return [result_str, result_file_path]
322
+ except Exception as e:
323
+ print(f"Error transcribing file: {e}")
324
+ finally:
325
+ self.release_cuda_memory()
326
+
327
+ def transcribe_youtube(self,
328
+ youtube_link: str,
329
+ file_format: str = "SRT",
330
+ add_timestamp: bool = True,
331
+ progress=gr.Progress(),
332
+ *whisper_params,
333
+ ) -> list:
334
+ """
335
+ Write subtitle file from Youtube
336
+
337
+ Parameters
338
+ ----------
339
+ youtube_link: str
340
+ URL of the Youtube video to transcribe from gr.Textbox()
341
+ file_format: str
342
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
343
+ add_timestamp: bool
344
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
345
+ progress: gr.Progress
346
+ Indicator to show progress directly in gradio.
347
+ *whisper_params: tuple
348
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
349
+
350
+ Returns
351
+ ----------
352
+ result_str:
353
+ Result of transcription to return to gr.Textbox()
354
+ result_file_path:
355
+ Output file path to return to gr.Files()
356
+ """
357
+ try:
358
+ progress(0, desc="Loading Audio from Youtube..")
359
+ yt = get_ytdata(youtube_link)
360
+ audio = get_ytaudio(yt)
361
+
362
+ transcribed_segments, time_for_task = self.run(
363
+ audio,
364
+ progress,
365
+ add_timestamp,
366
+ *whisper_params,
367
+ )
368
+
369
+ progress(1, desc="Completed!")
370
+
371
+ file_name = safe_filename(yt.title)
372
+ subtitle, result_file_path = self.generate_and_write_file(
373
+ file_name=file_name,
374
+ transcribed_segments=transcribed_segments,
375
+ add_timestamp=add_timestamp,
376
+ file_format=file_format,
377
+ output_dir=self.output_dir
378
+ )
379
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
380
+
381
+ if os.path.exists(audio):
382
+ os.remove(audio)
383
+
384
+ return [result_str, result_file_path]
385
+
386
+ except Exception as e:
387
+ print(f"Error transcribing file: {e}")
388
+ finally:
389
+ self.release_cuda_memory()
390
+
391
+ @staticmethod
392
+ def generate_and_write_file(file_name: str,
393
+ transcribed_segments: list,
394
+ add_timestamp: bool,
395
+ file_format: str,
396
+ output_dir: str
397
+ ) -> Tuple[str, str]:
398
+ """
399
+ Writes subtitle file
400
+
401
+ Parameters
402
+ ----------
403
+ file_name: str
404
+ Output file name
405
+ transcribed_segments: list
406
+ Text segments transcribed from audio
407
+ add_timestamp: bool
408
+ Determines whether to add a timestamp to the end of the filename.
409
+ file_format: str
410
+ File format to write. Supported formats: [SRT, WebVTT, txt]
411
+ output_dir: str
412
+ Directory path of the output
413
+
414
+ Returns
415
+ ----------
416
+ content: str
417
+ Result of the transcription
418
+ output_path: str
419
+ output file path
420
+ """
421
+ if add_timestamp:
422
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
423
+ output_path = os.path.join(output_dir, f"{file_name} - {timestamp}")
424
+ else:
425
+ output_path = os.path.join(output_dir, f"{file_name}")
426
+
427
+ file_format = file_format.strip().lower()
428
+ if file_format == "srt":
429
+ content = get_srt(transcribed_segments)
430
+ output_path += '.srt'
431
+
432
+ elif file_format == "webvtt":
433
+ content = get_vtt(transcribed_segments)
434
+ output_path += '.vtt'
435
+
436
+ elif file_format == "txt":
437
+ content = get_txt(transcribed_segments)
438
+ output_path += '.txt'
439
+
440
+ write_file(content, output_path)
441
+ return content, output_path
442
+
443
+ @staticmethod
444
+ def format_time(elapsed_time: float) -> str:
445
+ """
446
+ Get {hours} {minutes} {seconds} time format string
447
+
448
+ Parameters
449
+ ----------
450
+ elapsed_time: float
451
+ Elapsed time for transcription
452
+
453
+ Returns
454
+ ----------
455
+ Time format string
456
+ """
457
+ hours, rem = divmod(elapsed_time, 3600)
458
+ minutes, seconds = divmod(rem, 60)
459
+
460
+ time_str = ""
461
+ if hours:
462
+ time_str += f"{hours} hours "
463
+ if minutes:
464
+ time_str += f"{minutes} minutes "
465
+ seconds = round(seconds)
466
+ time_str += f"{seconds} seconds"
467
+
468
+ return time_str.strip()
469
+
470
+ @staticmethod
471
+ def get_device():
472
+ if torch.cuda.is_available():
473
+ return "cuda"
474
+ elif torch.backends.mps.is_available():
475
+ if not WhisperBase.is_sparse_api_supported():
476
+ # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
477
+ return "cpu"
478
+ return "mps"
479
+ else:
480
+ return "cpu"
481
+
482
+ @staticmethod
483
+ def is_sparse_api_supported():
484
+ if not torch.backends.mps.is_available():
485
+ return False
486
+
487
+ try:
488
+ device = torch.device("mps")
489
+ sparse_tensor = torch.sparse_coo_tensor(
490
+ indices=torch.tensor([[0, 1], [2, 3]]),
491
+ values=torch.tensor([1, 2]),
492
+ size=(4, 4),
493
+ device=device
494
+ )
495
+ return True
496
+ except RuntimeError:
497
+ return False
498
+
499
+ @staticmethod
500
+ def release_cuda_memory():
501
+ """Release memory"""
502
+ if torch.cuda.is_available():
503
+ torch.cuda.empty_cache()
504
+ torch.cuda.reset_max_memory_allocated()
505
+
506
+ @staticmethod
507
+ def remove_input_files(file_paths: List[str]):
508
+ """Remove gradio cached files"""
509
+ if not file_paths:
510
+ return
511
+
512
+ for file_path in file_paths:
513
+ if file_path and os.path.exists(file_path):
514
+ os.remove(file_path)
515
+
516
+ @staticmethod
517
+ def cache_parameters(
518
+ whisper_params: WhisperValues,
519
+ add_timestamp: bool
520
+ ):
521
+ """cache parameters to the yaml file"""
522
+ cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
523
+ cached_whisper_param = whisper_params.to_yaml()
524
+ cached_yaml = {**cached_params, **cached_whisper_param}
525
+ cached_yaml["whisper"]["add_timestamp"] = add_timestamp
526
+
527
+ save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
528
+
529
+ @staticmethod
530
+ def resample_audio(audio: Union[str, np.ndarray],
531
+ new_sample_rate: int = 16000,
532
+ original_sample_rate: Optional[int] = None,) -> np.ndarray:
533
+ """Resamples audio to 16k sample rate, standard on Whisper model"""
534
+ if isinstance(audio, str):
535
+ audio, original_sample_rate = torchaudio.load(audio)
536
+ else:
537
+ if original_sample_rate is None:
538
+ raise ValueError("original_sample_rate must be provided when audio is numpy array.")
539
+ audio = torch.from_numpy(audio)
540
+ resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
541
+ resampled_audio = resampler(audio).numpy()
542
+ return resampled_audio
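Two of the small helpers above are easy to sanity-check in isolation; a short sketch with illustrative values:

import numpy as np
from modules.whisper.whisper_base import WhisperBase

print(WhisperBase.format_time(3725.0))         # 1 hours 2 minutes 5 seconds
print(WhisperBase.format_time(42.4))           # 42 seconds

audio_44k = np.zeros(44100, dtype=np.float32)  # 1 second of silence at 44.1 kHz
audio_16k = WhisperBase.resample_audio(audio_44k, original_sample_rate=44100)
print(audio_16k.shape)                         # (16000,)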
modules/whisper/whisper_factory.py ADDED
@@ -0,0 +1,90 @@
1
+ from typing import Optional
2
+ import os
3
+
4
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
5
+ INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
6
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
+ from modules.whisper.whisper_Inference import WhisperInference
8
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
+ from modules.whisper.whisper_base import WhisperBase
10
+
11
+
12
+ class WhisperFactory:
13
+ @staticmethod
14
+ def create_whisper_inference(
15
+ whisper_type: str,
16
+ whisper_model_dir: str = WHISPER_MODELS_DIR,
17
+ faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
18
+ insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
19
+ diarization_model_dir: str = DIARIZATION_MODELS_DIR,
20
+ uvr_model_dir: str = UVR_MODELS_DIR,
21
+ output_dir: str = OUTPUT_DIR,
22
+ ) -> "WhisperBase":
23
+ """
24
+ Create a whisper inference class based on the provided whisper_type.
25
+
26
+ Parameters
27
+ ----------
28
+ whisper_type : str
29
+ The type of Whisper implementation to use. Supported values (case-insensitive):
30
+ - "faster-whisper": https://github.com/openai/whisper
31
+ - "whisper": https://github.com/openai/whisper
32
+ - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
33
+ whisper_model_dir : str
34
+ Directory path for the Whisper model.
35
+ faster_whisper_model_dir : str
36
+ Directory path for the Faster Whisper model.
37
+ insanely_fast_whisper_model_dir : str
38
+ Directory path for the Insanely Fast Whisper model.
39
+ diarization_model_dir : str
40
+ Directory path for the diarization model.
41
+ uvr_model_dir : str
42
+ Directory path for the UVR model.
43
+ output_dir : str
44
+ Directory path where output files will be saved.
45
+
46
+ Returns
47
+ -------
48
+ WhisperBase
49
+ An instance of the appropriate whisper inference class based on the whisper_type.
50
+ """
51
+ # Temporary fix for the bug: https://github.com/jhj0517/Whisper-WebUI/issues/144
52
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
53
+
54
+ whisper_type = whisper_type.lower().strip()
55
+
56
+ faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
57
+ whisper_typos = ["whisper"]
58
+ insanely_fast_whisper_typos = [
59
+ "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
60
+ "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
61
+ ]
62
+
63
+ if whisper_type in faster_whisper_typos:
64
+ return FasterWhisperInference(
65
+ model_dir=faster_whisper_model_dir,
66
+ output_dir=output_dir,
67
+ diarization_model_dir=diarization_model_dir,
68
+ uvr_model_dir=uvr_model_dir
69
+ )
70
+ elif whisper_type in whisper_typos:
71
+ return WhisperInference(
72
+ model_dir=whisper_model_dir,
73
+ output_dir=output_dir,
74
+ diarization_model_dir=diarization_model_dir,
75
+ uvr_model_dir=uvr_model_dir
76
+ )
77
+ elif whisper_type in insanely_fast_whisper_typos:
78
+ return InsanelyFastWhisperInference(
79
+ model_dir=insanely_fast_whisper_model_dir,
80
+ output_dir=output_dir,
81
+ diarization_model_dir=diarization_model_dir,
82
+ uvr_model_dir=uvr_model_dir
83
+ )
84
+ else:
85
+ return FasterWhisperInference(
86
+ model_dir=faster_whisper_model_dir,
87
+ output_dir=output_dir,
88
+ diarization_model_dir=diarization_model_dir,
89
+ uvr_model_dir=uvr_model_dir
90
+ )
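A brief sketch of how the factory above is meant to be used; the whisper_type string normally comes from the app's startup configuration, and any unrecognized value silently falls back to FasterWhisperInference.

from modules.whisper.whisper_factory import WhisperFactory

inferencer = WhisperFactory.create_whisper_inference(whisper_type="Faster-Whisper")
print(type(inferencer).__name__)               # FasterWhisperInference

# "whisper" and "insanely-fast-whisper" (or the aliases listed above) select the other backends.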
modules/whisper/whisper_parameter.py ADDED
@@ -0,0 +1,369 @@
1
+ from dataclasses import dataclass, fields
2
+ import gradio as gr
3
+ from typing import Optional, Dict
4
+ import yaml
5
+
6
+
7
+ @dataclass
8
+ class WhisperParameters:
9
+ model_size: gr.Dropdown
10
+ lang: gr.Dropdown
11
+ is_translate: gr.Checkbox
12
+ beam_size: gr.Number
13
+ log_prob_threshold: gr.Number
14
+ no_speech_threshold: gr.Number
15
+ compute_type: gr.Dropdown
16
+ best_of: gr.Number
17
+ patience: gr.Number
18
+ condition_on_previous_text: gr.Checkbox
19
+ prompt_reset_on_temperature: gr.Slider
20
+ initial_prompt: gr.Textbox
21
+ temperature: gr.Slider
22
+ compression_ratio_threshold: gr.Number
23
+ vad_filter: gr.Checkbox
24
+ threshold: gr.Slider
25
+ min_speech_duration_ms: gr.Number
26
+ max_speech_duration_s: gr.Number
27
+ min_silence_duration_ms: gr.Number
28
+ speech_pad_ms: gr.Number
29
+ batch_size: gr.Number
30
+ is_diarize: gr.Checkbox
31
+ hf_token: gr.Textbox
32
+ diarization_device: gr.Dropdown
33
+ length_penalty: gr.Number
34
+ repetition_penalty: gr.Number
35
+ no_repeat_ngram_size: gr.Number
36
+ prefix: gr.Textbox
37
+ suppress_blank: gr.Checkbox
38
+ suppress_tokens: gr.Textbox
39
+ max_initial_timestamp: gr.Number
40
+ word_timestamps: gr.Checkbox
41
+ prepend_punctuations: gr.Textbox
42
+ append_punctuations: gr.Textbox
43
+ max_new_tokens: gr.Number
44
+ chunk_length: gr.Number
45
+ hallucination_silence_threshold: gr.Number
46
+ hotwords: gr.Textbox
47
+ language_detection_threshold: gr.Number
48
+ language_detection_segments: gr.Number
49
+ is_bgm_separate: gr.Checkbox
50
+ uvr_model_size: gr.Dropdown
51
+ uvr_device: gr.Dropdown
52
+ uvr_segment_size: gr.Number
53
+ uvr_save_file: gr.Checkbox
54
+ uvr_enable_offload: gr.Checkbox
55
+ """
56
+ A data class for the Gradio components of the Whisper parameters. Used "before" Gradio pre-processing.
57
+ This data class is used to mitigate the key-value problem between Gradio components and function parameters.
58
+ Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
59
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
60
+
61
+ Attributes
62
+ ----------
63
+ model_size: gr.Dropdown
64
+ Whisper model size.
65
+
66
+ lang: gr.Dropdown
67
+ Source language of the file to transcribe.
68
+
69
+ is_translate: gr.Checkbox
70
+ Boolean value that determines whether to translate to English.
71
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
72
+
73
+ beam_size: gr.Number
74
+ Int value that is used for decoding option.
75
+
76
+ log_prob_threshold: gr.Number
77
+ If the average log probability over sampled tokens is below this value, treat as failed.
78
+
79
+ no_speech_threshold: gr.Number
80
+ If the no_speech probability is higher than this value AND
81
+ the average log probability over sampled tokens is below `log_prob_threshold`,
82
+ consider the segment as silent.
83
+
84
+ compute_type: gr.Dropdown
85
+ compute type for transcription.
86
+ see more info : https://opennmt.net/CTranslate2/quantization.html
87
+
88
+ best_of: gr.Number
89
+ Number of candidates when sampling with non-zero temperature.
90
+
91
+ patience: gr.Number
92
+ Beam search patience factor.
93
+
94
+ condition_on_previous_text: gr.Checkbox
95
+ if True, the previous output of the model is provided as a prompt for the next window;
96
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
97
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
98
+
99
+ initial_prompt: gr.Textbox
100
+ Optional text to provide as a prompt for the first window. This can be used to provide, or
101
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
102
+ to make it more likely to predict those words correctly.
103
+
104
+ temperature: gr.Slider
105
+ Temperature for sampling. It can be a tuple of temperatures,
106
+ which will be successively used upon failures according to either
107
+ `compression_ratio_threshold` or `log_prob_threshold`.
108
+
109
+ compression_ratio_threshold: gr.Number
110
+ If the gzip compression ratio is above this value, treat as failed
111
+
112
+ vad_filter: gr.Checkbox
113
+ Enable the voice activity detection (VAD) to filter out parts of the audio
114
+ without speech. This step is using the Silero VAD model
115
+ https://github.com/snakers4/silero-vad.
116
+
117
+ threshold: gr.Slider
118
+ This parameter is related with Silero VAD. Speech threshold.
119
+ Silero VAD outputs speech probabilities for each audio chunk,
120
+ probabilities ABOVE this value are considered as SPEECH. It is better to tune this
121
+ parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
122
+
123
+ min_speech_duration_ms: gr.Number
124
+ This parameter is related with Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
125
+
126
+ max_speech_duration_s: gr.Number
127
+ This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
128
+ than max_speech_duration_s will be split at the timestamp of the last silence that
129
+ lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
130
+ split aggressively just before max_speech_duration_s.
131
+
132
+ min_silence_duration_ms: gr.Number
133
+ This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
134
+ before separating it
135
+
136
+ speech_pad_ms: gr.Number
137
+ This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
138
+
139
+ batch_size: gr.Number
140
+ This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
141
+
142
+ is_diarize: gr.Checkbox
143
+ This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
144
+
145
+ hf_token: gr.Textbox
146
+ This parameter is related with whisperx. Huggingface token is needed to download diarization models.
147
+ Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
148
+
149
+ diarization_device: gr.Dropdown
150
+ This parameter is related with whisperx. Device to run diarization model
151
+
152
+ length_penalty: gr.Number
153
+ This parameter is related to faster-whisper. Exponential length penalty constant.
154
+
155
+ repetition_penalty: gr.Number
156
+ This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
157
+ (set > 1 to penalize).
158
+
159
+ no_repeat_ngram_size: gr.Number
160
+ This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
161
+
162
+ prefix: gr.Textbox
163
+ This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
164
+
165
+ suppress_blank: gr.Checkbox
166
+ This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
167
+
168
+ suppress_tokens: gr.Textbox
169
+ This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
170
+ of symbols as defined in the model config.json file.
171
+
172
+ max_initial_timestamp: gr.Number
173
+ This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
174
+
175
+ word_timestamps: gr.Checkbox
176
+ This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
177
+ and dynamic time warping, and include the timestamps for each word in each segment.
178
+
179
+ prepend_punctuations: gr.Textbox
180
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
181
+ with the next word.
182
+
183
+ append_punctuations: gr.Textbox
184
+ This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
185
+ with the previous word.
186
+
187
+ max_new_tokens: gr.Number
188
+ This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
189
+ the maximum will be set by the default max_length.
190
+
191
+ chunk_length: gr.Number
192
+ This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
193
+ If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
194
+
195
+ hallucination_silence_threshold: gr.Number
196
+ This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
197
+ (in seconds) when a possible hallucination is detected.
198
+
199
+ hotwords: gr.Textbox
200
+ This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
201
+
202
+ language_detection_threshold: gr.Number
203
+ This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
204
+
205
+ language_detection_segments: gr.Number
206
+ This parameter is related to faster-whisper. Number of segments to consider for the language detection.
207
+
208
+ is_separate_bgm: gr.Checkbox
209
+ This parameter is related to UVR. Boolean value that determines whether to separate BGM (background music) or not.
210
+
211
+ uvr_model_size: gr.Dropdown
212
+ This parameter is related to UVR. UVR model size.
213
+
214
+ uvr_device: gr.Dropdown
215
+ This parameter is related to UVR. Device on which to run the UVR model.
216
+
217
+ uvr_segment_size: gr.Number
218
+ This parameter is related to UVR. Segment size for UVR model.
219
+
220
+ uvr_save_file: gr.Checkbox
221
+ This parameter is related to UVR. Boolean value that determines whether to save the file or not.
222
+
223
+ uvr_enable_offload: gr.Checkbox
224
+ This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
225
+ after each transcription.
226
+ """
227
+
228
+ def as_list(self) -> list:
229
+ """
230
+ Converts the data class attributes into a list. Used in the Gradio UI before Gradio pre-processing.
231
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
232
+
233
+ Returns
234
+ ----------
235
+ A list of Gradio components
236
+ """
237
+ return [getattr(self, f.name) for f in fields(self)]
238
+
239
+ @staticmethod
240
+ def as_value(*args) -> 'WhisperValues':
241
+ """
242
+ Used to rebuild Whisper parameter values in a function after Gradio post-processing.
243
+ See more about Gradio post-processing: https://www.gradio.app/docs/components
244
+
245
+ Returns
246
+ ----------
247
+ WhisperValues
248
+ Data class that has values of parameters
249
+ """
250
+ return WhisperValues(*args)
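Usage note: `as_list()` is what gets handed to a Gradio event listener as `inputs=`, and `as_value(*args)` rebuilds a `WhisperValues` from the post-processed values the listener receives, which implicitly relies on both classes declaring their fields in the same order. A minimal sketch of that ordering contract, using only `WhisperValues` (defined just below); the import path is assumed from the repository layout, and the field values are the defaults shown there.

```python
from dataclasses import fields
from modules.whisper.whisper_parameter import WhisperValues  # path assumed

values = WhisperValues(model_size="medium", beam_size=3)

flat = values.as_list()                       # one plain value per field, in declaration order
names = [f.name for f in fields(WhisperValues)]
assert dict(zip(names, flat))["beam_size"] == 3

# as_value(*args) performs the inverse positional mapping, which is why the
# Gradio components and WhisperValues must declare their fields in the same order.
roundtrip = WhisperValues(*flat)
assert roundtrip == values
```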
251
+
252
+
253
+ @dataclass
254
+ class WhisperValues:
255
+ model_size: str = "large-v2"
256
+ lang: Optional[str] = None
257
+ is_translate: bool = False
258
+ beam_size: int = 5
259
+ log_prob_threshold: float = -1.0
260
+ no_speech_threshold: float = 0.6
261
+ compute_type: str = "float16"
262
+ best_of: int = 5
263
+ patience: float = 1.0
264
+ condition_on_previous_text: bool = True
265
+ prompt_reset_on_temperature: float = 0.5
266
+ initial_prompt: Optional[str] = None
267
+ temperature: float = 0.0
268
+ compression_ratio_threshold: float = 2.4
269
+ vad_filter: bool = False
270
+ threshold: float = 0.5
271
+ min_speech_duration_ms: int = 250
272
+ max_speech_duration_s: float = float("inf")
273
+ min_silence_duration_ms: int = 2000
274
+ speech_pad_ms: int = 400
275
+ batch_size: int = 24
276
+ is_diarize: bool = False
277
+ hf_token: str = ""
278
+ diarization_device: str = "cuda"
279
+ length_penalty: float = 1.0
280
+ repetition_penalty: float = 1.0
281
+ no_repeat_ngram_size: int = 0
282
+ prefix: Optional[str] = None
283
+ suppress_blank: bool = True
284
+ suppress_tokens: Optional[str] = "[-1]"
285
+ max_initial_timestamp: float = 0.0
286
+ word_timestamps: bool = False
287
+ prepend_punctuations: Optional[str] = "\"'“¿([{-"
288
+ append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
289
+ max_new_tokens: Optional[int] = None
290
+ chunk_length: Optional[int] = 30
291
+ hallucination_silence_threshold: Optional[float] = None
292
+ hotwords: Optional[str] = None
293
+ language_detection_threshold: Optional[float] = None
294
+ language_detection_segments: int = 1
295
+ is_bgm_separate: bool = False
296
+ uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
297
+ uvr_device: str = "cuda"
298
+ uvr_segment_size: int = 256
299
+ uvr_save_file: bool = False
300
+ uvr_enable_offload: bool = True
301
+ """
302
+ A data class that holds Whisper parameter values.
303
+ """
304
+
305
+ def to_yaml(self) -> Dict:
306
+ data = {
307
+ "whisper": {
308
+ "model_size": self.model_size,
309
+ "lang": "Automatic Detection" if self.lang is None else self.lang,
310
+ "is_translate": self.is_translate,
311
+ "beam_size": self.beam_size,
312
+ "log_prob_threshold": self.log_prob_threshold,
313
+ "no_speech_threshold": self.no_speech_threshold,
314
+ "best_of": self.best_of,
315
+ "patience": self.patience,
316
+ "condition_on_previous_text": self.condition_on_previous_text,
317
+ "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
318
+ "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
319
+ "temperature": self.temperature,
320
+ "compression_ratio_threshold": self.compression_ratio_threshold,
321
+ "batch_size": self.batch_size,
322
+ "length_penalty": self.length_penalty,
323
+ "repetition_penalty": self.repetition_penalty,
324
+ "no_repeat_ngram_size": self.no_repeat_ngram_size,
325
+ "prefix": None if not self.prefix else self.prefix,
326
+ "suppress_blank": self.suppress_blank,
327
+ "suppress_tokens": self.suppress_tokens,
328
+ "max_initial_timestamp": self.max_initial_timestamp,
329
+ "word_timestamps": self.word_timestamps,
330
+ "prepend_punctuations": self.prepend_punctuations,
331
+ "append_punctuations": self.append_punctuations,
332
+ "max_new_tokens": self.max_new_tokens,
333
+ "chunk_length": self.chunk_length,
334
+ "hallucination_silence_threshold": self.hallucination_silence_threshold,
335
+ "hotwords": None if not self.hotwords else self.hotwords,
336
+ "language_detection_threshold": self.language_detection_threshold,
337
+ "language_detection_segments": self.language_detection_segments,
338
+ },
339
+ "vad": {
340
+ "vad_filter": self.vad_filter,
341
+ "threshold": self.threshold,
342
+ "min_speech_duration_ms": self.min_speech_duration_ms,
343
+ "max_speech_duration_s": self.max_speech_duration_s,
344
+ "min_silence_duration_ms": self.min_silence_duration_ms,
345
+ "speech_pad_ms": self.speech_pad_ms,
346
+ },
347
+ "diarization": {
348
+ "is_diarize": self.is_diarize,
349
+ "hf_token": self.hf_token
350
+ },
351
+ "bgm_separation": {
352
+ "is_separate_bgm": self.is_bgm_separate,
353
+ "model_size": self.uvr_model_size,
354
+ "segment_size": self.uvr_segment_size,
355
+ "save_file": self.uvr_save_file,
356
+ "enable_offload": self.uvr_enable_offload
357
+ },
358
+ }
359
+ return data
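A short, hedged example of consuming `to_yaml()`: it regroups the flat dataclass into `whisper` / `vad` / `diarization` / `bgm_separation` sections, so dumping the result with PyYAML yields a nested config file in that shape (presumably the same shape as configs/default_parameters.yaml, though that mapping is an assumption here). The import path and output filename are placeholders.

```python
import yaml  # PyYAML

from modules.whisper.whisper_parameter import WhisperValues  # path assumed

values = WhisperValues(model_size="medium", vad_filter=True, min_silence_duration_ms=1000)

# to_yaml() returns a plain nested dict; safe_dump writes it out as YAML.
with open("my_parameters.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(values.to_yaml(), f, allow_unicode=True, sort_keys=False)
```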
360
+
361
+ def as_list(self) -> list:
362
+ """
363
+ Converts the data class attributes into a list
364
+
365
+ Returns
366
+ ----------
367
+ A list of Whisper parameters
368
+ """
369
+ return [getattr(self, f.name) for f in fields(self)]
notebook/whisper-webui.ipynb ADDED
@@ -0,0 +1,132 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "---\n",
7
+ "\n",
8
+ "📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
9
+ "\n",
10
+ "🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
11
+ "\n",
12
+ "😎 **Support the Project**:\n",
13
+ "\n",
14
+ "If you find this project useful, please consider supporting it:\n",
15
+ "\n",
16
+ "<a href=\"https://ko-fi.com/jhj0517\" target=\"_blank\">\n",
17
+ " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
18
+ "</a>\n",
19
+ "\n",
20
+ "---"
21
+ ],
22
+ "metadata": {
23
+ "id": "doKhBBXIfS21"
24
+ }
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "source": [
29
+ "#@title #(Optional) Check GPU\n",
30
+ "#@markdown Some models may not function correctly on a CPU runtime.\n",
31
+ "\n",
32
+ "#@markdown so you should check your GPU setup before run.\n",
33
+ "!nvidia-smi"
34
+ ],
35
+ "metadata": {
36
+ "id": "23yZvUlagEsx",
37
+ "cellView": "form"
38
+ },
39
+ "execution_count": null,
40
+ "outputs": []
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {
46
+ "id": "kNbSbsctxahq",
47
+ "cellView": "form"
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "#@title #Installation\n",
52
+ "#@markdown This cell will install dependencies for Whisper-WebUI!\n",
53
+ "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
54
+ "%cd Whisper-WebUI\n",
55
+ "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
+ "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install gradio==4.43.0\n",
58
+ "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
59
+ "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
60
+ "!pip install tokenizers==0.19.1\n",
61
+ "!pip install pyannote.audio==3.3.1\n",
62
+ "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "source": [
68
+ "#@title # (Optional) Configure arguments\n",
69
+ "#@markdown This section is used to configure some command line arguments.\n",
70
+ "\n",
71
+ "#@markdown You can simply ignore this section and the default values will be used.\n",
72
+ "\n",
73
+ "USERNAME = '' #@param {type: \"string\"}\n",
74
+ "PASSWORD = '' #@param {type: \"string\"}\n",
75
+ "WHISPER_TYPE = 'faster-whisper' # @param [\"whisper\", \"faster-whisper\", \"insanely-fast-whisper\"]\n",
76
+ "THEME = '' #@param {type: \"string\"}\n",
77
+ "\n",
78
+ "arguments = \"\"\n",
79
+ "if USERNAME:\n",
80
+ " arguments += f\" --username {USERNAME}\"\n",
81
+ "if PASSWORD:\n",
82
+ " arguments += f\" --password {PASSWORD}\"\n",
83
+ "if THEME:\n",
84
+ " arguments += f\" --theme {THEME}\"\n",
85
+ "if WHISPER_TYPE:\n",
86
+ " arguments += f\" --whisper_type {WHISPER_TYPE}\"\n",
87
+ "\n",
88
+ "\n",
89
+ "#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
90
+ ],
91
+ "metadata": {
92
+ "id": "Qosz9BFlGui3",
93
+ "cellView": "form"
94
+ },
95
+ "execution_count": null,
96
+ "outputs": []
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 3,
101
+ "metadata": {
102
+ "id": "PQroYRRZzQiN",
103
+ "cellView": "form"
104
+ },
105
+ "outputs": [],
106
+ "source": [
107
+ "#@title #Run\n",
108
+ "#@markdown Once the installation is complete, you can use public URL that is displayed.\n",
109
+ "if 'arguments' in locals():\n",
110
+ " !python app.py --share --colab{arguments}\n",
111
+ "else:\n",
112
+ " !python app.py --share --colab"
113
+ ]
114
+ }
115
+ ],
116
+ "metadata": {
117
+ "colab": {
118
+ "provenance": [],
119
+ "gpuType": "T4"
120
+ },
121
+ "kernelspec": {
122
+ "display_name": "Python 3",
123
+ "name": "python3"
124
+ },
125
+ "language_info": {
126
+ "name": "python"
127
+ },
128
+ "accelerator": "GPU"
129
+ },
130
+ "nbformat": 4,
131
+ "nbformat_minor": 0
132
+ }
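One note on the notebook's "Run" cell: `--colab{arguments}` produces a well-formed command only because every flag appended to `arguments` starts with a leading space. A small illustration (not part of the notebook) with placeholder values:

```python
# Placeholder values standing in for the Colab form fields.
USERNAME, PASSWORD, THEME, WHISPER_TYPE = "me", "secret", "", "faster-whisper"

arguments = ""
if USERNAME:
    arguments += f" --username {USERNAME}"
if PASSWORD:
    arguments += f" --password {PASSWORD}"
if THEME:
    arguments += f" --theme {THEME}"
if WHISPER_TYPE:
    arguments += f" --whisper_type {WHISPER_TYPE}"

# Mirrors the `!python app.py --share --colab{arguments}` line in the Run cell.
print(f"python app.py --share --colab{arguments}")
# -> python app.py --share --colab --username me --password secret --whisper_type faster-whisper
```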
outputs/outputs are saved here.txt ADDED
File without changes
outputs/translations/outputs for translation are saved here.txt ADDED
File without changes