Upload 63 files

This view is limited to 50 files because it contains too many changes.
- .dockerignore +10 -0
- .github/FUNDING.yml +13 -0
- .github/ISSUE_TEMPLATE/bug_report.md +11 -0
- .github/ISSUE_TEMPLATE/feature_request.md +10 -0
- .github/ISSUE_TEMPLATE/hallucination.md +12 -0
- .github/pull_request_template.md +5 -0
- .github/workflows/ci-shell.yml +43 -0
- .github/workflows/ci.yml +41 -0
- .github/workflows/publish-docker.yml +37 -0
- .gitignore +13 -0
- Dockerfile +34 -0
- Install.bat +20 -0
- Install.sh +17 -0
- LICENSE +201 -0
- README.md +117 -12
- app.py +359 -0
- configs/default_parameters.yaml +64 -0
- demo/audio.wav +0 -0
- docker-compose.yaml +29 -0
- models/models will be saved here.txt +0 -0
- modules/__init__.py +0 -0
- modules/diarize/__init__.py +0 -0
- modules/diarize/audio_loader.py +179 -0
- modules/diarize/diarize_pipeline.py +95 -0
- modules/diarize/diarizer.py +133 -0
- modules/translation/__init__.py +0 -0
- modules/translation/deepl_api.py +226 -0
- modules/translation/nllb_inference.py +287 -0
- modules/translation/translation_base.py +177 -0
- modules/ui/__init__.py +0 -0
- modules/ui/htmls.py +97 -0
- modules/utils/__init__.py +0 -0
- modules/utils/cli_manager.py +12 -0
- modules/utils/files_manager.py +69 -0
- modules/utils/paths.py +31 -0
- modules/utils/subtitle_manager.py +132 -0
- modules/utils/youtube_manager.py +33 -0
- modules/uvr/music_separator.py +183 -0
- modules/vad/__init__.py +0 -0
- modules/vad/silero_vad.py +264 -0
- modules/whisper/__init__.py +0 -0
- modules/whisper/faster_whisper_inference.py +192 -0
- modules/whisper/insanely_fast_whisper_inference.py +195 -0
- modules/whisper/whisper_Inference.py +104 -0
- modules/whisper/whisper_base.py +542 -0
- modules/whisper/whisper_factory.py +90 -0
- modules/whisper/whisper_parameter.py +369 -0
- notebook/whisper-webui.ipynb +132 -0
- outputs/outputs are saved here.txt +0 -0
- outputs/translations/outputs for translation are saved here.txt +0 -0
.dockerignore
ADDED
@@ -0,0 +1,10 @@
+# from .gitignore
+venv/
+ui/__pycache__/
+outputs/
+modules/__pycache__/
+models/
+modules/yt_tmp.wav
+
+.git
+.github
.github/FUNDING.yml
ADDED
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: []
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: jhj0517
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
.github/ISSUE_TEMPLATE/bug_report.md
ADDED
@@ -0,0 +1,11 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: jhj0517
+
+---
+
+**Which OS are you using?**
+- OS: [e.g. iOS or Windows. If you are using Google Colab, just write "Colab".]
.github/ISSUE_TEMPLATE/feature_request.md
ADDED
@@ -0,0 +1,10 @@
+---
+name: Feature request
+about: Any feature you want
+title: ''
+labels: enhancement
+assignees: jhj0517
+
+---
+
+
.github/ISSUE_TEMPLATE/hallucination.md
ADDED
@@ -0,0 +1,12 @@
+---
+name: Hallucination
+about: Whisper hallucinations. (Repeating certain words, subtitles starting too
+  early, etc.)
+title: ''
+labels: hallucination
+assignees: jhj0517
+
+---
+
+**Download URL for sample audio**
+- Please provide a download URL for a sample audio file so I can test some settings for a better result. You can use https://easyupload.io/ or any other service to share it.
.github/pull_request_template.md
ADDED
@@ -0,0 +1,5 @@
+## Related issues
+- #0
+
+## Changed
+1. Changes
.github/workflows/ci-shell.yml
ADDED
@@ -0,0 +1,43 @@
+name: CI-Shell Script
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  test-shell-script:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: [ "3.10" ]
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install git and ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+      - name: Execute Install.sh
+        run: |
+          chmod +x ./Install.sh
+          ./Install.sh
+
+      - name: Execute start-webui.sh
+        run: |
+          chmod +x ./start-webui.sh
+          timeout 60s ./start-webui.sh || true
+
.github/workflows/ci.yml
ADDED
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10"]
+
+    env:
+      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
+
+    steps:
+      - name: Clean up space for action
+        run: rm -rf /opt/hostedtoolcache
+
+      - uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install git and ffmpeg
+        run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt pytest
+
+      - name: Run test
+        run: python -m pytest -rs tests
.github/workflows/publish-docker.yml
ADDED
@@ -0,0 +1,37 @@
+name: Publish to Docker Hub
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ secrets.DOCKER_USERNAME }}/whisper-webui:latest
+
+      - name: Log out of Docker Hub
+        run: docker logout
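Once this workflow has published an image, it can be consumed without a local build. A minimal sketch — the Docker Hub account is whatever `DOCKER_USERNAME` resolves to, which this diff does not reveal, so the name below is a placeholder:

```sh
# Pull the image pushed by the workflow above.
# <dockerhub-username> is a placeholder for the account behind DOCKER_USERNAME.
docker pull <dockerhub-username>/whisper-webui:latest
```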
.gitignore
ADDED
@@ -0,0 +1,13 @@
+*.wav
+*.png
+*.mp4
+*.mp3
+.idea/
+.pytest_cache/
+venv/
+modules/ui/__pycache__/
+outputs/
+modules/__pycache__/
+models/
+modules/yt_tmp.wav
+configs/default_parameters.yaml
Dockerfile
ADDED
@@ -0,0 +1,34 @@
+FROM debian:bookworm-slim AS builder
+
+RUN apt-get update && \
+    apt-get install -y curl git python3 python3-pip python3-venv && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
+    mkdir -p /Whisper-WebUI
+
+WORKDIR /Whisper-WebUI
+
+COPY requirements.txt .
+
+RUN python3 -m venv venv && \
+    . venv/bin/activate && \
+    pip install --no-cache-dir -r requirements.txt
+
+
+FROM debian:bookworm-slim AS runtime
+
+RUN apt-get update && \
+    apt-get install -y curl ffmpeg python3 && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+WORKDIR /Whisper-WebUI
+
+COPY . .
+COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
+
+VOLUME [ "/Whisper-WebUI/models" ]
+VOLUME [ "/Whisper-WebUI/outputs" ]
+
+ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
+ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
+
+ENTRYPOINT [ "python", "app.py" ]
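For reference, a sketch of building and running this image directly, without the `docker-compose.yaml` that appears later in this diff. The tag, port, and `--server_name`/`--server_port` flags are assumptions taken from that compose file and from `app.py`; the host paths are illustrative:

```sh
# Build the image from this Dockerfile
docker build -t whisper-webui:latest .

# Run it; the trailing flags are appended to the "python app.py" ENTRYPOINT.
# Replace the host paths with real folders to persist models and outputs.
docker run --gpus all -p 7860:7860 \
  -v /path/to/models:/Whisper-WebUI/models \
  -v /path/to/outputs:/Whisper-WebUI/outputs \
  whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
```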
Install.bat
ADDED
@@ -0,0 +1,20 @@
+@echo off
+
+if not exist "%~dp0\venv\Scripts" (
+    echo Creating venv...
+    python -m venv venv
+)
+echo Checked the venv folder. Now installing requirements...
+
+call "%~dp0\venv\scripts\activate"
+
+pip install -r requirements.txt
+
+if errorlevel 1 (
+    echo.
+    echo Requirements installation failed. Please remove the venv folder and run Install.bat again.
+) else (
+    echo.
+    echo Requirements installed successfully.
+)
+pause
Install.sh
ADDED
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+if [ ! -d "venv" ]; then
+    echo "Creating virtual environment..."
+    python -m venv venv
+fi
+
+source venv/bin/activate
+
+pip install -r requirements.txt && echo "Requirements installed successfully." || {
+    echo ""
+    echo "Requirements installation failed. Please remove the venv folder and run the script again."
+    deactivate
+    exit 1
+}
+
+deactivate
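Usage mirrors the `ci-shell.yml` workflow above — mark the script executable and run it from the repository root:

```sh
chmod +x ./Install.sh
./Install.sh
```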
LICENSE
ADDED
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2023 jhj0517
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
README.md
CHANGED
@@ -1,12 +1,117 @@
+# Whisper-WebUI
+A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an Easy Subtitle Generator!
+
+![Whisper WebUI](https://github.com/jhj0517/Whsiper-WebUI/blob/master/screenshot.png)
+
+## Notebook
+If you wish to try this on Colab, you can do it [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
+
+# Features
+- Select the Whisper implementation you want to use between:
+  - [openai/whisper](https://github.com/openai/whisper)
+  - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
+  - [Vaibhavs10/insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
+- Generate subtitles from various sources, including:
+  - Files
+  - YouTube
+  - Microphone
+- Currently supported subtitle formats:
+  - SRT
+  - WebVTT
+  - txt (plain text without timestamps)
+- Speech to Text Translation
+  - From other languages to English. (This is Whisper's end-to-end speech-to-text translation feature.)
+- Text to Text Translation
+  - Translate subtitle files using Facebook NLLB models
+  - Translate subtitle files using the DeepL API
+- Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
+- Pre-processing audio input to separate BGM with [UVR](https://github.com/Anjok07/ultimatevocalremovergui), [UVR-api](https://github.com/NextAudioGen/ultimatevocalremover_api).
+- Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
+  - To download the pyannote model, you need a Hugging Face token and must manually accept their terms on the pages below.
+    1. https://huggingface.co/pyannote/speaker-diarization-3.1
+    2. https://huggingface.co/pyannote/segmentation-3.0
+
+# Installation and Running
+### Prerequisite
+To run this WebUI, you need `git`, `python` version 3.8 ~ 3.10, and `FFmpeg`. <br>
+If you're not using an Nvidia GPU, or are using a `CUDA` version other than 12.4, edit [`requirements.txt`](https://github.com/jhj0517/Whisper-WebUI/blob/master/requirements.txt) to match your environment.
+
+Please follow the links below to install the necessary software:
+- git: [https://git-scm.com/downloads](https://git-scm.com/downloads)
+- python: [https://www.python.org/downloads/](https://www.python.org/downloads/) **(If your python version is too new, torch will not install properly.)**
+- FFmpeg: [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
+- CUDA: [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
+
+After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
+
+### Automatic Installation
+
+1. Download `Whisper-WebUI.zip` with the file corresponding to your OS from [v1.0.0](https://github.com/jhj0517/Whisper-WebUI/releases/tag/v1.0.0) and extract its contents.
+2. Run `install.bat` or `install.sh` to install dependencies. (This will create a `venv` directory and install dependencies there.)
+3. Start the WebUI with `start-webui.bat` or `start-webui.sh`.
+4. To update the WebUI, run `update.bat` or `update.sh`.
+
+You can also run the project with command-line arguments if you like; see the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide to the arguments (a short example follows this diff).
+
+## Running with Docker
+
+1. Install and launch [Docker-Desktop](https://www.docker.com/products/docker-desktop/).
+
+2. Git clone the repository:
+
+```sh
+git clone https://github.com/jhj0517/Whisper-WebUI.git
+```
+
+3. Build the image (the image is about 7 GB):
+
+```sh
+docker compose build
+```
+
+4. Run the container:
+
+```sh
+docker compose up
+```
+
+5. Connect to the WebUI with your browser at `http://localhost:7860`.
+
+If needed, update [`docker-compose.yaml`](https://github.com/jhj0517/Whisper-WebUI/blob/master/docker-compose.yaml) to match your environment.
+
+# VRAM Usage
+This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
+
+According to faster-whisper, the efficiency of the optimized Whisper model is as follows:
+
+| Implementation | Precision | Beam size | Time  | Max. GPU memory | Max. CPU memory |
+|----------------|-----------|-----------|-------|-----------------|-----------------|
+| openai/whisper | fp16      | 5         | 4m30s | 11325MB         | 9439MB          |
+| faster-whisper | fp16      | 5         | 54s   | 4755MB          | 3244MB          |
+
+If you want to use an implementation other than faster-whisper, use the `--whisper_type` arg and the repository name.<br>
+Read the [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about CLI args.
+
+## Available models
+This is Whisper's original VRAM usage table for its models.
+
+| Size   | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
+|:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
+| tiny   | 39 M       | `tiny.en`          | `tiny`             | ~1 GB         | ~32x           |
+| base   | 74 M       | `base.en`          | `base`             | ~1 GB         | ~16x           |
+| small  | 244 M      | `small.en`         | `small`            | ~2 GB         | ~6x            |
+| medium | 769 M      | `medium.en`        | `medium`           | ~5 GB         | ~2x            |
+| large  | 1550 M     | N/A                | `large`            | ~10 GB        | 1x             |
+
+`.en` models are for English only, and the cool thing is that you can use the `Translate to English` option with the multilingual ("large") models!
+
+## TODO🗓
+
+- [x] Add DeepL API translation
+- [x] Add NLLB model translation
+- [x] Integrate with faster-whisper
+- [x] Integrate with insanely-fast-whisper
+- [x] Integrate with whisperX (only the speaker diarization part)
+- [x] Add background music separation pre-processing with [UVR](https://github.com/Anjok07/ultimatevocalremovergui)
+- [ ] Add FastAPI script
+- [ ] Support real-time transcription from the microphone
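As a concrete example of the command-line arguments referenced above, here is a launch sketch; every flag exists in `app.py`'s argument parser below, and the values are illustrative:

```sh
# Use the openai/whisper backend instead of the default faster-whisper,
# bind to all interfaces, and protect the UI with basic auth.
python app.py --whisper_type whisper \
              --server_name 0.0.0.0 --server_port 7860 \
              --username admin --password change-me
```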
app.py
ADDED
@@ -0,0 +1,359 @@
+import os
+import argparse
+import gradio as gr
+import yaml
+
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
+from modules.utils.files_manager import load_yaml
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.faster_whisper_inference import FasterWhisperInference
+from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
+from modules.translation.nllb_inference import NLLBInference
+from modules.ui.htmls import *
+from modules.utils.cli_manager import str2bool
+from modules.utils.youtube_manager import get_ytmetas
+from modules.translation.deepl_api import DeepLAPI
+from modules.whisper.whisper_parameter import *
+
+### Device info ###
+import torch
+import torchaudio
+import torch.cuda as cuda
+import platform
+from transformers import __version__ as transformers_version
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+num_gpus = cuda.device_count() if torch.cuda.is_available() else 0
+cuda_version = torch.version.cuda if torch.cuda.is_available() else "N/A"
+cudnn_version = torch.backends.cudnn.version() if torch.cuda.is_available() else "N/A"
+os_info = platform.system() + " " + platform.release() + " " + platform.machine()
+
+# Get the available VRAM for each GPU (if available)
+vram_info = []
+if torch.cuda.is_available():
+    for i in range(cuda.device_count()):
+        gpu_properties = cuda.get_device_properties(i)
+        vram_info.append(f"**GPU {i}: {gpu_properties.total_memory / 1024**3:.2f} GB**")
+
+pytorch_version = torch.__version__
+torchaudio_version = torchaudio.__version__ if 'torchaudio' in dir() else "N/A"
+
+device_info = f"""Running on: **{device}**
+
+Number of GPUs available: **{num_gpus}**
+
+CUDA version: **{cuda_version}**
+
+CuDNN version: **{cudnn_version}**
+
+PyTorch version: **{pytorch_version}**
+
+Torchaudio version: **{torchaudio_version}**
+
+Transformers version: **{transformers_version}**
+
+Operating system: **{os_info}**
+
+Available VRAM:
+\t {', '.join(vram_info) if vram_info else '**N/A**'}
+"""
+### End Device info ###
+
+class App:
+    def __init__(self, args):
+        self.args = args
+        #self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
+        self.app = gr.Blocks(css=CSS, theme=gr.themes.Ocean(), delete_cache=(60, 3600))
+        self.whisper_inf = WhisperFactory.create_whisper_inference(
+            whisper_type=self.args.whisper_type,
+            whisper_model_dir=self.args.whisper_model_dir,
+            faster_whisper_model_dir=self.args.faster_whisper_model_dir,
+            insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
+            output_dir=self.args.output_dir,
+        )
+        self.nllb_inf = NLLBInference(
+            model_dir=self.args.nllb_model_dir,
+            output_dir=os.path.join(self.args.output_dir, "translations")
+        )
+        self.deepl_api = DeepLAPI(
+            output_dir=os.path.join(self.args.output_dir, "translations")
+        )
+        self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation")
+        print(f"Device \"{self.whisper_inf.device}\" is detected")
+
+    def create_whisper_parameters(self):
+
+        whisper_params = self.default_params["whisper"]
+        diarization_params = self.default_params["diarization"]
+        vad_params = self.default_params["vad"]
+        uvr_params = self.default_params["bgm_separation"]
+
+        with gr.Row():
+            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"], label="Model")
+            dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs, value=whisper_params["lang"], label="Language")
+            #dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
+            dd_file_format = gr.Dropdown(choices=["SRT", "txt"], value="SRT", label="Output format")
+
+        with gr.Row():
+            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add timestamp to output file", interactive=True)
+            cb_diarize = gr.Checkbox(label="Speaker diarization", value=diarization_params["is_diarize"])
+            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English", interactive=True)
+
+        with gr.Accordion("Diarization options", open=False):
+            tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
+                                  info="This is only needed the first time you download the model. If you already have"
+                                       " models, you don't need to enter. To download the model, you must manually go "
+                                       "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
+                                       " their requirement.")
+            dd_diarization_device = gr.Dropdown(label="Device",
+                                                choices=self.whisper_inf.diarizer.get_available_device(),
+                                                value=self.whisper_inf.diarizer.get_device())
+
+        with gr.Accordion("Advanced options", open=False):
+            nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
+                                     info="Beam size to use for decoding.")
+            nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
+                                              info="If the average log probability over sampled tokens is below this value, treat as failed.")
+            nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
+                                               info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
+            dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
+                                          value=self.whisper_inf.current_compute_type, interactive=True,
+                                          allow_custom_value=True,
+                                          info="Select the type of computation to perform.")
+            nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
+                                   info="Number of candidates when sampling with non-zero temperature.")
+            nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
+                                    info="Beam search patience factor.")
+            cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
+                                                        interactive=True,
+                                                        info="Condition on previous text during decoding.")
+            sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
+                                                        minimum=0, maximum=1, step=0.01, interactive=True,
+                                                        info="Resets prompt if temperature is above this value."
+                                                             " Arg has effect only if 'Condition On Previous Text' is True.")
+            tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
+                                           info="Initial prompt to use for decoding.")
+            sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
+                                       step=0.01, maximum=1.0, interactive=True,
+                                       info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
+            nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
+                                                       interactive=True,
+                                                       info="If the gzip compression ratio is above this value, treat as failed.")
+            nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
+                                        precision=0,
+                                        info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+            with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+                nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
+                                              info="Exponential length penalty constant.")
+                nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
+                                                  info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+                nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
+                                                    precision=0,
+                                                    info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+                tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
+                                       info="Optional text to provide as a prefix for the first window.")
+                cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
+                                                info="Suppress blank outputs at the beginning of the sampling.")
+                tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
+                                                info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+                nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
+                                                     info="The initial timestamp cannot be later than this.")
+                cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
+                                                 info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+                tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
+                                                     info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
+                tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
+                                                    info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
+                nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
+                                              precision=0,
+                                              info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+                nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
+                                                               value=lambda: whisper_params["hallucination_silence_threshold"],
+                                                               info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+                tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
+                                         info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+                nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
+                                                            info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+                nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
+                                                           precision=0,
+                                                           info="Number of segments to consider for the language detection.")
+            with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
+                nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
+
+        with gr.Accordion("Background Music Remover Filter", open=False):
+            cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
+                                            interactive=True,
+                                            info="Enabling this will remove background music by submodel before"
+                                                 " transcribing ")
+            dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                        choices=self.whisper_inf.music_separator.available_devices)
+            dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                            choices=self.whisper_inf.music_separator.available_models)
+            nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+            cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+            cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
+                                                value=uvr_params["enable_offload"])
+
+        with gr.Accordion("Voice Detection Filter", open=False):
+            cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
+                                        interactive=True,
+                                        info="Enable this to transcribe only detected voice parts by submodel.")
+            sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+                                     value=vad_params["threshold"],
+                                     info="Lower it to be more sensitive to small sounds.")
+            nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+                                                  value=vad_params["min_speech_duration_ms"],
+                                                  info="Final speech chunks shorter than this time are thrown out")
+            nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
+                                                 value=vad_params["max_speech_duration_s"],
+                                                 info="Maximum duration of speech chunks in \"seconds\".")
+            nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+                                                   value=vad_params["min_silence_duration_ms"],
+                                                   info="In the end of each speech chunk wait for this time"
+                                                        " before separating it")
+            nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
+                                         info="Final speech chunks are padded by this time each side")
+
+        dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
+
+        return (
+            WhisperParameters(
+                model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
+                log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
+                compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
+                condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
+                temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
+                vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
+                max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
+                speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
+                is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
+                length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
+                no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
+                suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
+                word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
+                append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
+                hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
+                language_detection_threshold=nb_language_detection_threshold,
+                language_detection_segments=nb_language_detection_segments,
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+                uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+                uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
+            ),
+            dd_file_format,
+            cb_timestamp
+        )
+
+    def launch(self):
+        translation_params = self.default_params["translation"]
+        deepl_params = translation_params["deepl"]
+        nllb_params = translation_params["nllb"]
+        uvr_params = self.default_params["bgm_separation"]
+
+        with self.app:
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(MARKDOWN, elem_id="md_project")
+            with gr.Tabs():
+                with gr.TabItem("Audio"):  # tab1
+                    with gr.Column():
+                        #input_file = gr.Files(type="filepath", label="Upload File here")
+                        input_file = gr.Audio(type='filepath', elem_id="audio_input")
+                        tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+                                                     info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
+                                                          " Leave this field empty if you do not wish to use a local path.",
+                                                     visible=self.args.colab,
+                                                     value="")
+
+                    whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
+
+                    with gr.Row():
+                        btn_run = gr.Button("Transcribe", variant="primary")
+                        btn_reset = gr.Button(value="Reset")
+                        btn_reset.click(None, js="window.location.reload()")
+                    with gr.Row():
+                        with gr.Column(scale=3):
+                            tb_indicator = gr.Textbox(label="Output result")
+                        with gr.Column(scale=1):
+                            tb_info = gr.Textbox(label="Output info", interactive=False, scale=3)
+                            files_subtitles = gr.Files(label="Output file", interactive=False, scale=2)
+                            # btn_openfolder = gr.Button('📂', scale=1)
+
+                    params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
+                    btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                  inputs=params + whisper_params.as_list(),
+                                  outputs=[tb_indicator, files_subtitles, tb_info])
+                    # btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
+
+                with gr.TabItem("Device info"):  # tab2
+                    with gr.Column():
+                        gr.Markdown(device_info, label="Hardware info & installed packages")
+
+        # Launch the app with optional gradio settings
+        args = self.args
+
+        self.app.queue(
+            api_open=args.api_open
+        ).launch(
+            share=args.share,
+            server_name=args.server_name,
+            server_port=args.server_port,
+            auth=(args.username, args.password) if args.username and args.password else None,
+            root_path=args.root_path,
+            inbrowser=args.inbrowser
+        )
+
+    @staticmethod
+    def open_folder(folder_path: str):
+        if os.path.exists(folder_path):
+            os.system(f"start {folder_path}")
+        else:
+            os.makedirs(folder_path, exist_ok=True)
+            print(f"The directory path {folder_path} has newly created.")
+
+    @staticmethod
+    def on_change_models(model_size: str):
+        translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
+        if model_size not in translatable_model:
+            return gr.Checkbox(visible=False, value=False, interactive=False)
+            #return gr.Checkbox(visible=True, value=False, label="Translate to English (large models only)", interactive=False)
+        else:
+            return gr.Checkbox(visible=True, value=False, label="Translate to English", interactive=True)
+
+
+# Create the parser for command-line arguments
+parser = argparse.ArgumentParser()
+parser.add_argument('--whisper_type', type=str, default="faster-whisper",
+                    help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
+parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
+parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
+parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
+parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
+parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
+parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
+parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
+parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
+parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
+parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
+parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
+                    help='Directory path of the whisper model')
+parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
+                    help='Directory path of the faster-whisper model')
+parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
+                    default=INSANELY_FAST_WHISPER_MODELS_DIR,
+                    help='Directory path of the insanely-fast-whisper model')
+parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
+                    help='Directory path of the diarization model')
+parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
+                    help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                    help='Directory path of the UVR model')
+parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
+_args = parser.parse_args()
+
+if __name__ == "__main__":
+    app = App(args=_args)
+    app.launch()
configs/default_parameters.yaml
ADDED
@@ -0,0 +1,64 @@
whisper:
  model_size: "large-v3"
  lang: "Automatic Detection"
  is_translate: false
  beam_size: 5
  log_prob_threshold: -1
  no_speech_threshold: 0.6
  best_of: 5
  patience: 1
  condition_on_previous_text: true
  prompt_reset_on_temperature: 0.5
  initial_prompt: null
  temperature: 0
  compression_ratio_threshold: 2.4
  chunk_length: 30
  batch_size: 24
  length_penalty: 1
  repetition_penalty: 1
  no_repeat_ngram_size: 0
  prefix: null
  suppress_blank: true
  suppress_tokens: "[-1]"
  max_initial_timestamp: 1
  word_timestamps: false
  prepend_punctuations: "\"'“¿([{-"
  append_punctuations: "\"'.。,,!!??::”)]}、"
  max_new_tokens: null
  hallucination_silence_threshold: null
  hotwords: null
  language_detection_threshold: null
  language_detection_segments: 1
  add_timestamp: false

vad:
  vad_filter: false
  threshold: 0.5
  min_speech_duration_ms: 250
  max_speech_duration_s: 9999
  min_silence_duration_ms: 1000
  speech_pad_ms: 2000

diarization:
  is_diarize: false
  hf_token: ""

bgm_separation:
  is_separate_bgm: false
  model_size: "UVR-MDX-NET-Inst_HQ_4"
  segment_size: 256
  save_file: false
  enable_offload: true

translation:
  deepl:
    api_key: ""
    is_pro: false
    source_lang: "Automatic Detection"
    target_lang: "English"
  nllb:
    model_size: "facebook/nllb-200-1.3B"
    source_lang: null
    target_lang: null
    max_length: 200
  add_timestamp: true

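These defaults are read and written at runtime via the YAML helpers in modules/utils/files_manager.py (added later in this same commit); a minimal sketch of reading one value back:

# Hedged example: both imports come from modules added in this commit.
from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
print(params["whisper"]["model_size"])  # "large-v3"
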
demo/audio.wav
ADDED
Binary file (209 kB)
docker-compose.yaml
ADDED
@@ -0,0 +1,29 @@
services:
  app:
    build: .
    image: whisper-webui:latest

    volumes:
      # Update these to mount the models and output paths to your own custom paths, e.g.:
      # - C:/whisper-models/custom-path:/Whisper-WebUI/models
      # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
      - /Whisper-WebUI/models
      - /Whisper-WebUI/outputs

    ports:
      - "7860:7860"

    stdin_open: true
    tty: true

    entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0"]

    # If you're not using an nvidia GPU, update the device to match yours.
    # See more info at: https://docs.docker.com/compose/compose-file/deploy/#driver
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
models/models will be saved here.txt
ADDED
File without changes
modules/__init__.py
ADDED
File without changes
modules/diarize/__init__.py
ADDED
File without changes
modules/diarize/audio_loader.py
ADDED
@@ -0,0 +1,179 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py

import os
import subprocess
from functools import lru_cache
from typing import Optional, Union
from scipy.io.wavfile import write
import tempfile

import numpy as np
import torch
import torch.nn.functional as F

def exact_div(x, y):
    assert x % y == 0
    return x // y

# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions have stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token


def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file, or process a numpy array containing audio data, as a mono waveform, resampling as necessary.

    Parameters
    ----------
    file: Union[str, np.ndarray]
        The audio file to open or a numpy array containing the audio data.

    sr: int
        The sample rate to resample the audio to, if necessary.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if isinstance(file, np.ndarray):
        if file.dtype != np.float32:
            file = file.astype(np.float32)
        if file.ndim > 1:
            file = np.mean(file, axis=1)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
        temp_file_path = temp_file.name
        temp_file.close()
    else:
        temp_file_path = file

    try:
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            temp_file_path,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            "-",
        ]
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    finally:
        if isinstance(file, np.ndarray):
            os.remove(temp_file_path)

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    """
    if torch.is_tensor(array):
        if array.shape[axis] > length:
            array = array.index_select(
                dim=axis, index=torch.arange(length, device=array.device)
            )

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
    else:
        if array.shape[axis] > length:
            array = array.take(indices=range(length), axis=axis)

        if array.shape[axis] < length:
            pad_widths = [(0, 0)] * array.ndim
            pad_widths[axis] = (0, length - array.shape[axis])
            array = np.pad(array, pad_widths)

    return array


@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling the librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of the given audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio, or a NumPy array or Tensor containing the audio waveform at 16 kHz

    n_mels: int
        The number of Mel-frequency filters; only 80 and 128 are supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec

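A minimal usage sketch of the loaders above, assuming ffmpeg is available on PATH and using the demo clip shipped in this commit:

waveform = load_audio("demo/audio.wav")       # float32 mono waveform at 16 kHz
padded = pad_or_trim(waveform)                # exactly N_SAMPLES (30 s) long
mel = log_mel_spectrogram(padded, n_mels=80)  # tensor of shape (80, 3000)
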
modules/diarize/diarize_pipeline.py
ADDED
@@ -0,0 +1,95 @@
# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py

import numpy as np
import pandas as pd
import os
from pyannote.audio import Pipeline
from typing import Optional, Union
import torch

from modules.utils.paths import DIARIZATION_MODELS_DIR
from modules.diarize.audio_loader import load_audio, SAMPLE_RATE


class DiarizationPipeline:
    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.1",
        cache_dir: str = DIARIZATION_MODELS_DIR,
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        if isinstance(device, str):
            device = torch.device(device)
        self.model = Pipeline.from_pretrained(
            model_name,
            use_auth_token=use_auth_token,
            cache_dir=cache_dir
        ).to(device)

    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df


def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
    transcript_segments = transcript_result["segments"]
    for seg in transcript_segments:
        # assign speaker to segment (if any)
        diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
                                                                                            seg['start'])
        diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])

        intersected = diarize_df[diarize_df["intersection"] > 0]

        speaker = None
        if len(intersected) > 0:
            # Choose the speaker with the strongest total intersection
            speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
        elif fill_nearest:
            # Otherwise choose the closest speaker
            speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]

        if speaker is not None:
            seg["speaker"] = speaker

        # assign speaker to words
        if 'words' in seg:
            for word in seg['words']:
                if 'start' in word:
                    diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
                        diarize_df['start'], word['start'])
                    diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
                                                                                                  word['start'])

                    intersected = diarize_df[diarize_df["intersection"] > 0]

                    word_speaker = None
                    if len(intersected) > 0:
                        # Choose the speaker with the strongest total intersection
                        word_speaker = \
                            intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
                    elif fill_nearest:
                        # Otherwise choose the closest speaker
                        word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]

                    if word_speaker is not None:
                        word["speaker"] = word_speaker

    return transcript_result


class Segment:
    def __init__(self, start, end, speaker=None):
        self.start = start
        self.end = end
        self.speaker = speaker

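A minimal sketch of the overlap-based assignment in assign_word_speakers, using a hand-built two-speaker DataFrame (all values hypothetical, for illustration only):

toy_df = pd.DataFrame({"speaker": ["SPEAKER_00", "SPEAKER_01"],
                       "start": [0.0, 5.0], "end": [5.0, 10.0]})
toy_result = assign_word_speakers(toy_df, {"segments": [{"start": 1.0, "end": 4.0, "text": "hello"}]})
print(toy_result["segments"][0]["speaker"])  # SPEAKER_00 (largest total overlap)
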
modules/diarize/diarizer.py
ADDED
@@ -0,0 +1,133 @@
import os
import torch
from typing import List, Union, BinaryIO, Optional
import numpy as np
import time
import logging

from modules.utils.paths import DIARIZATION_MODELS_DIR
from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
from modules.diarize.audio_loader import load_audio


class Diarizer:
    def __init__(self,
                 model_dir: str = DIARIZATION_MODELS_DIR
                 ):
        self.device = self.get_device()
        self.available_device = self.get_available_device()
        self.compute_type = "float16"
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.pipe = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            transcribed_result: List[dict],
            use_auth_token: str,
            device: Optional[str] = None
            ):
        """
        Diarize the transcribed result as a post-processing step.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be a file path or a binary type.
        transcribed_result: List[dict]
            Transcribed result from Whisper.
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to the TOS to download the model.
        device: Optional[str]
            Device for diarization.

        Returns
        ----------
        segments_result: List[dict]
            List of dicts that includes start and end timestamps and the transcribed text
        elapsed_time: float
            elapsed time for running
        """
        start_time = time.time()

        if device is None:
            device = self.device

        if device != self.device or self.pipe is None:
            self.update_pipe(
                device=device,
                use_auth_token=use_auth_token
            )

        audio = load_audio(audio)

        diarization_segments = self.pipe(audio)
        diarized_result = assign_word_speakers(
            diarization_segments,
            {"segments": transcribed_result}
        )

        for segment in diarized_result["segments"]:
            speaker = "None"
            if "speaker" in segment:
                speaker = segment["speaker"]
            segment["text"] = speaker + ": " + segment["text"].strip()

        elapsed_time = time.time() - start_time
        return diarized_result["segments"], elapsed_time

    def update_pipe(self,
                    use_auth_token: str,
                    device: str
                    ):
        """
        Set the pipeline for diarization.

        Parameters
        ----------
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to the TOS to download the model.
        device: str
            Device for diarization.
        """
        self.device = device

        os.makedirs(self.model_dir, exist_ok=True)

        if (not os.listdir(self.model_dir) and
                not use_auth_token):
            print(
                "\nFailed to diarize. You need a huggingface token and must agree to their requirements to download the diarization model.\n"
                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
            )
            return

        logger = logging.getLogger("speechbrain.utils.train_logger")
        # Disable redundant torchvision warning message
        logger.disabled = True
        self.pipe = DiarizationPipeline(
            use_auth_token=use_auth_token,
            device=device,
            cache_dir=self.model_dir
        )
        logger.disabled = False

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def get_available_device():
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        elif torch.backends.mps.is_available():
            devices.append("mps")
        return devices

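A minimal end-to-end sketch of Diarizer, assuming a valid Huggingface READ token (placeholder below) and that the pyannote model terms have been accepted:

diarizer = Diarizer()
segments, elapsed = diarizer.run(
    audio="demo/audio.wav",
    transcribed_result=[{"start": 0.0, "end": 3.0, "text": "hello there"}],
    use_auth_token="hf_xxx",  # hypothetical placeholder token
)
print(segments[0]["text"])    # e.g. "SPEAKER_00: hello there"
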
modules/translation/__init__.py
ADDED
File without changes
modules/translation/deepl_api.py
ADDED
@@ -0,0 +1,226 @@
import requests
import time
import os
from datetime import datetime
import gradio as gr

from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
from modules.utils.subtitle_manager import *
from modules.utils.files_manager import load_yaml, save_yaml

"""
This is written with reference to the DeepL API documentation.
If you want to know more about the DeepL API, see here: https://www.deepl.com/docs-api/documents
"""

DEEPL_AVAILABLE_TARGET_LANGS = {
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'English (British)': 'EN-GB',
    'English (American)': 'EN-US',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese': 'PT',
    'Portuguese (Brazilian)': 'PT-BR',
    'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese (simplified)': 'ZH'
}

DEEPL_AVAILABLE_SOURCE_LANGS = {
    'Automatic Detection': None,
    'Bulgarian': 'BG',
    'Czech': 'CS',
    'Danish': 'DA',
    'German': 'DE',
    'Greek': 'EL',
    'English': 'EN',
    'Spanish': 'ES',
    'Estonian': 'ET',
    'Finnish': 'FI',
    'French': 'FR',
    'Hungarian': 'HU',
    'Indonesian': 'ID',
    'Italian': 'IT',
    'Japanese': 'JA',
    'Korean': 'KO',
    'Lithuanian': 'LT',
    'Latvian': 'LV',
    'Norwegian (Bokmål)': 'NB',
    'Dutch': 'NL',
    'Polish': 'PL',
    'Portuguese (all Portuguese varieties mixed)': 'PT',
    'Romanian': 'RO',
    'Russian': 'RU',
    'Slovak': 'SK',
    'Slovenian': 'SL',
    'Swedish': 'SV',
    'Turkish': 'TR',
    'Ukrainian': 'UK',
    'Chinese': 'ZH'
}


class DeepLAPI:
    def __init__(self,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        self.api_interval = 1
        self.max_text_batch_size = 50
        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
        self.output_dir = output_dir

    def translate_deepl(self,
                        auth_key: str,
                        fileobjs: list,
                        source_lang: str,
                        target_lang: str,
                        is_pro: bool = False,
                        add_timestamp: bool = True,
                        progress=gr.Progress()) -> list:
        """
        Translate subtitle files using the DeepL API

        Parameters
        ----------
        auth_key: str
            API key for DeepL from gr.Textbox()
        fileobjs: list
            List of files to translate from gr.Files()
        source_lang: str
            Source language of the files from gr.Dropdown()
        target_lang: str
            Target language of the files from gr.Dropdown()
        is_pro: bool
            Boolean value from gr.Checkbox() indicating whether the API key belongs to a Pro account.
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Files to return to gr.Files()
        """
        if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
            fileobjs = [fileobj.name for fileobj in fileobjs]

        self.cache_parameters(
            api_key=auth_key,
            is_pro=is_pro,
            source_lang=source_lang,
            target_lang=target_lang,
            add_timestamp=add_timestamp
        )

        files_info = {}
        for fileobj in fileobjs:
            file_path = fileobj
            file_name, file_ext = os.path.splitext(os.path.basename(fileobj))

            if file_ext == ".srt":
                parsed_dicts = parse_srt(file_path=file_path)
            elif file_ext == ".vtt":
                parsed_dicts = parse_vtt(file_path=file_path)

            batch_size = self.max_text_batch_size
            for batch_start in range(0, len(parsed_dicts), batch_size):
                batch_end = min(batch_start + batch_size, len(parsed_dicts))
                sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
                translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                target_lang, is_pro)
                for i, translated_text in enumerate(translated_texts):
                    parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
                progress(batch_end / len(parsed_dicts), desc="Translating..")

            if file_ext == ".srt":
                subtitle = get_serialized_srt(parsed_dicts)
            elif file_ext == ".vtt":
                subtitle = get_serialized_vtt(parsed_dicts)

            if add_timestamp:
                timestamp = datetime.now().strftime("%m%d%H%M%S")
                file_name += f"-{timestamp}"

            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
            write_file(subtitle, output_path)

            files_info[file_name] = {"subtitle": subtitle, "path": output_path}

        total_result = ''
        for file_name, info in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
            total_result += f'{info["subtitle"]}'
        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

        output_file_paths = [item["path"] for key, item in files_info.items()]
        return [gr_str, output_file_paths]

    def request_deepl_translate(self,
                                auth_key: str,
                                text: list,
                                source_lang: str,
                                target_lang: str,
                                is_pro: bool = False):
        """Request a translation from the DeepL server"""
        if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
            raise ValueError(f"Source language {source_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
        if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
            raise ValueError(f"Target language {target_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")

        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
            'Authorization': f'DeepL-Auth-Key {auth_key}'
        }
        data = {
            'text': text,
            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
        }
        response = requests.post(url, headers=headers, data=data).json()
        time.sleep(self.api_interval)
        return response["translations"]

    @staticmethod
    def cache_parameters(api_key: str,
                         is_pro: bool,
                         source_lang: str,
                         target_lang: str,
                         add_timestamp: bool):
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["deepl"] = {
            "api_key": api_key,
            "is_pro": is_pro,
            "source_lang": source_lang,
            "target_lang": target_lang
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)

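A minimal sketch of the raw request helper above, assuming a valid free-tier key (placeholder below):

deepl = DeepLAPI()
translations = deepl.request_deepl_translate(
    auth_key="your-deepl-key",   # hypothetical placeholder
    text=["Guten Tag"],
    source_lang="German",
    target_lang="English",
    is_pro=False,
)
print(translations[0]["text"])   # e.g. "Good day"
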
modules/translation/nllb_inference.py
ADDED
@@ -0,0 +1,287 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import gradio as gr
import os

from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
from modules.translation.translation_base import TranslationBase


class NLLBInference(TranslationBase):
    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir
        )
        self.tokenizer = None
        self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.pipeline = None

    def translate(self,
                  text: str,
                  max_length: int
                  ):
        result = self.pipeline(
            text,
            max_length=max_length
        )
        return result[0]['translation_text']

    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        def validate_language(lang: str) -> str:
            if lang in NLLB_AVAILABLE_LANGS:
                return NLLB_AVAILABLE_LANGS[lang]
            elif lang not in NLLB_AVAILABLE_LANGS.values():
                raise ValueError(
                    f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
            return lang

        src_lang = validate_language(src_lang)
        tgt_lang = validate_language(tgt_lang)

        if model_size != self.current_model_size or self.model is None:
            print("\nInitializing NLLB Model..\n")
            progress(0, desc="Initializing NLLB Model..")
            self.current_model_size = model_size
            local_files_only = self.is_model_exists(self.current_model_size)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
                                                               cache_dir=self.model_dir,
                                                               local_files_only=local_files_only)
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                           cache_dir=os.path.join(self.model_dir, "tokenizers"),
                                                           local_files_only=local_files_only)

        self.pipeline = pipeline("translation",
                                 model=self.model,
                                 tokenizer=self.tokenizer,
                                 src_lang=src_lang,
                                 tgt_lang=tgt_lang,
                                 device=self.device)

    def is_model_exists(self,
                        model_size: str):
        """Check if the model already exists locally (facebook models only)"""
        prefix = "models--facebook--"
        _id, model_size_name = model_size.split("/")
        model_dir_name = prefix + model_size_name
        model_dir_path = os.path.join(self.model_dir, model_dir_name)
        if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
            return True
        return False


NLLB_AVAILABLE_LANGS = {
    "Acehnese (Arabic script)": "ace_Arab",
    "Acehnese (Latin script)": "ace_Latn",
    "Mesopotamian Arabic": "acm_Arab",
    "Ta’izzi-Adeni Arabic": "acq_Arab",
    "Tunisian Arabic": "aeb_Arab",
    "Afrikaans": "afr_Latn",
    "South Levantine Arabic": "ajp_Arab",
    "Akan": "aka_Latn",
    "Amharic": "amh_Ethi",
    "North Levantine Arabic": "apc_Arab",
    "Modern Standard Arabic": "arb_Arab",
    "Modern Standard Arabic (Romanized)": "arb_Latn",
    "Najdi Arabic": "ars_Arab",
    "Moroccan Arabic": "ary_Arab",
    "Egyptian Arabic": "arz_Arab",
    "Assamese": "asm_Beng",
    "Asturian": "ast_Latn",
    "Awadhi": "awa_Deva",
    "Central Aymara": "ayr_Latn",
    "South Azerbaijani": "azb_Arab",
    "North Azerbaijani": "azj_Latn",
    "Bashkir": "bak_Cyrl",
    "Bambara": "bam_Latn",
    "Balinese": "ban_Latn",
    "Belarusian": "bel_Cyrl",
    "Bemba": "bem_Latn",
    "Bengali": "ben_Beng",
    "Bhojpuri": "bho_Deva",
    "Banjar (Arabic script)": "bjn_Arab",
    "Banjar (Latin script)": "bjn_Latn",
    "Standard Tibetan": "bod_Tibt",
    "Bosnian": "bos_Latn",
    "Buginese": "bug_Latn",
    "Bulgarian": "bul_Cyrl",
    "Catalan": "cat_Latn",
    "Cebuano": "ceb_Latn",
    "Czech": "ces_Latn",
    "Chokwe": "cjk_Latn",
    "Central Kurdish": "ckb_Arab",
    "Crimean Tatar": "crh_Latn",
    "Welsh": "cym_Latn",
    "Danish": "dan_Latn",
    "German": "deu_Latn",
    "Southwestern Dinka": "dik_Latn",
    "Dyula": "dyu_Latn",
    "Dzongkha": "dzo_Tibt",
    "Greek": "ell_Grek",
    "English": "eng_Latn",
    "Esperanto": "epo_Latn",
    "Estonian": "est_Latn",
    "Basque": "eus_Latn",
    "Ewe": "ewe_Latn",
    "Faroese": "fao_Latn",
    "Fijian": "fij_Latn",
    "Finnish": "fin_Latn",
    "Fon": "fon_Latn",
    "French": "fra_Latn",
    "Friulian": "fur_Latn",
    "Nigerian Fulfulde": "fuv_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Irish": "gle_Latn",
    "Galician": "glg_Latn",
    "Guarani": "grn_Latn",
    "Gujarati": "guj_Gujr",
    "Haitian Creole": "hat_Latn",
    "Hausa": "hau_Latn",
    "Hebrew": "heb_Hebr",
    "Hindi": "hin_Deva",
    "Chhattisgarhi": "hne_Deva",
    "Croatian": "hrv_Latn",
    "Hungarian": "hun_Latn",
    "Armenian": "hye_Armn",
    "Igbo": "ibo_Latn",
    "Ilocano": "ilo_Latn",
    "Indonesian": "ind_Latn",
    "Icelandic": "isl_Latn",
    "Italian": "ita_Latn",
    "Javanese": "jav_Latn",
    "Japanese": "jpn_Jpan",
    "Kabyle": "kab_Latn",
    "Jingpho": "kac_Latn",
    "Kamba": "kam_Latn",
    "Kannada": "kan_Knda",
    "Kashmiri (Arabic script)": "kas_Arab",
    "Kashmiri (Devanagari script)": "kas_Deva",
    "Georgian": "kat_Geor",
    "Central Kanuri (Arabic script)": "knc_Arab",
    "Central Kanuri (Latin script)": "knc_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kabiyè": "kbp_Latn",
    "Kabuverdianu": "kea_Latn",
    "Khmer": "khm_Khmr",
    "Kikuyu": "kik_Latn",
    "Kinyarwanda": "kin_Latn",
    "Kyrgyz": "kir_Cyrl",
    "Kimbundu": "kmb_Latn",
    "Northern Kurdish": "kmr_Latn",
    "Kikongo": "kon_Latn",
    "Korean": "kor_Hang",
    "Lao": "lao_Laoo",
    "Ligurian": "lij_Latn",
    "Limburgish": "lim_Latn",
    "Lingala": "lin_Latn",
    "Lithuanian": "lit_Latn",
    "Lombard": "lmo_Latn",
    "Latgalian": "ltg_Latn",
    "Luxembourgish": "ltz_Latn",
    "Luba-Kasai": "lua_Latn",
    "Ganda": "lug_Latn",
    "Luo": "luo_Latn",
    "Mizo": "lus_Latn",
    "Standard Latvian": "lvs_Latn",
    "Magahi": "mag_Deva",
    "Maithili": "mai_Deva",
    "Malayalam": "mal_Mlym",
    "Marathi": "mar_Deva",
    "Minangkabau (Arabic script)": "min_Arab",
    "Minangkabau (Latin script)": "min_Latn",
    "Macedonian": "mkd_Cyrl",
    "Plateau Malagasy": "plt_Latn",
    "Maltese": "mlt_Latn",
    "Meitei (Bengali script)": "mni_Beng",
    "Halh Mongolian": "khk_Cyrl",
    "Mossi": "mos_Latn",
    "Maori": "mri_Latn",
    "Burmese": "mya_Mymr",
    "Dutch": "nld_Latn",
    "Norwegian Nynorsk": "nno_Latn",
    "Norwegian Bokmål": "nob_Latn",
    "Nepali": "npi_Deva",
    "Northern Sotho": "nso_Latn",
    "Nuer": "nus_Latn",
    "Nyanja": "nya_Latn",
    "Occitan": "oci_Latn",
    "West Central Oromo": "gaz_Latn",
    "Odia": "ory_Orya",
    "Pangasinan": "pag_Latn",
    "Eastern Panjabi": "pan_Guru",
    "Papiamento": "pap_Latn",
    "Western Persian": "pes_Arab",
    "Polish": "pol_Latn",
    "Portuguese": "por_Latn",
    "Dari": "prs_Arab",
    "Southern Pashto": "pbt_Arab",
    "Ayacucho Quechua": "quy_Latn",
    "Romanian": "ron_Latn",
    "Rundi": "run_Latn",
    "Russian": "rus_Cyrl",
    "Sango": "sag_Latn",
    "Sanskrit": "san_Deva",
    "Santali": "sat_Olck",
    "Sicilian": "scn_Latn",
    "Shan": "shn_Mymr",
    "Sinhala": "sin_Sinh",
    "Slovak": "slk_Latn",
    "Slovenian": "slv_Latn",
    "Samoan": "smo_Latn",
    "Shona": "sna_Latn",
    "Sindhi": "snd_Arab",
    "Somali": "som_Latn",
    "Southern Sotho": "sot_Latn",
    "Spanish": "spa_Latn",
    "Tosk Albanian": "als_Latn",
    "Sardinian": "srd_Latn",
    "Serbian": "srp_Cyrl",
    "Swati": "ssw_Latn",
    "Sundanese": "sun_Latn",
    "Swedish": "swe_Latn",
    "Swahili": "swh_Latn",
    "Silesian": "szl_Latn",
    "Tamil": "tam_Taml",
    "Tatar": "tat_Cyrl",
    "Telugu": "tel_Telu",
    "Tajik": "tgk_Cyrl",
    "Tagalog": "tgl_Latn",
    "Thai": "tha_Thai",
    "Tigrinya": "tir_Ethi",
    "Tamasheq (Latin script)": "taq_Latn",
    "Tamasheq (Tifinagh script)": "taq_Tfng",
    "Tok Pisin": "tpi_Latn",
    "Tswana": "tsn_Latn",
    "Tsonga": "tso_Latn",
    "Turkmen": "tuk_Latn",
    "Tumbuka": "tum_Latn",
    "Turkish": "tur_Latn",
    "Twi": "twi_Latn",
    "Central Atlas Tamazight": "tzm_Tfng",
    "Uyghur": "uig_Arab",
    "Ukrainian": "ukr_Cyrl",
    "Umbundu": "umb_Latn",
    "Urdu": "urd_Arab",
    "Northern Uzbek": "uzn_Latn",
    "Venetian": "vec_Latn",
    "Vietnamese": "vie_Latn",
    "Waray": "war_Latn",
    "Wolof": "wol_Latn",
    "Xhosa": "xho_Latn",
    "Eastern Yiddish": "ydd_Hebr",
    "Yoruba": "yor_Latn",
    "Yue Chinese": "yue_Hant",
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Standard Malay": "zsm_Latn",
    "Zulu": "zul_Latn",
}

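A minimal sketch of the class above; the first call downloads the distilled 600M model into models/NLLB:

nllb = NLLBInference()
nllb.update_model(model_size="facebook/nllb-200-distilled-600M",
                  src_lang="Korean", tgt_lang="English")
print(nllb.translate("안녕하세요", max_length=200))  # e.g. "Hello"
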
modules/translation/translation_base.py
ADDED
@@ -0,0 +1,177 @@
import os
import torch
import gradio as gr
from abc import ABC, abstractmethod
from typing import List
from datetime import datetime

from modules.whisper.whisper_parameter import *
from modules.utils.subtitle_manager import *
from modules.utils.files_manager import load_yaml, save_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR


class TranslationBase(ABC):
    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__()
        self.model = None
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.current_model_size = None
        self.device = self.get_device()

    @abstractmethod
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        pass

    @abstractmethod
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        pass

    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       max_length: int = 200,
                       add_timestamp: bool = True,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle files from the source language to the target language

        Parameters
        ----------
        fileobjs: list
            List of files to translate from gr.Files()
        model_size: str
            Translation model size from gr.Dropdown()
        src_lang: str
            Source language of the files from gr.Dropdown()
        tgt_lang: str
            Target language of the files from gr.Dropdown()
        max_length: int
            Max length per line to translate
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Files to return to gr.Files()
        """
        try:
            if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
                fileobjs = [file.name for file in fileobjs]

            self.cache_parameters(model_size=model_size,
                                  src_lang=src_lang,
                                  tgt_lang=tgt_lang,
                                  max_length=max_length,
                                  add_timestamp=add_timestamp)

            self.update_model(model_size=model_size,
                              src_lang=src_lang,
                              tgt_lang=tgt_lang,
                              progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
                if file_ext == ".srt":
                    parsed_dicts = parse_srt(file_path=fileobj)
                    total_progress = len(parsed_dicts)
                    for index, dic in enumerate(parsed_dicts):
                        progress(index / total_progress, desc="Translating..")
                        translated_text = self.translate(dic["sentence"], max_length=max_length)
                        dic["sentence"] = translated_text
                    subtitle = get_serialized_srt(parsed_dicts)

                elif file_ext == ".vtt":
                    parsed_dicts = parse_vtt(file_path=fileobj)
                    total_progress = len(parsed_dicts)
                    for index, dic in enumerate(parsed_dicts):
                        progress(index / total_progress, desc="Translating..")
                        translated_text = self.translate(dic["sentence"], max_length=max_length)
                        dic["sentence"] = translated_text
                    subtitle = get_serialized_vtt(parsed_dicts)

                if add_timestamp:
                    timestamp = datetime.now().strftime("%m%d%H%M%S")
                    file_name += f"-{timestamp}"

                output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
                write_file(subtitle, output_path)

                files_info[file_name] = {"subtitle": subtitle, "path": output_path}

            total_result = ''
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
            gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

            output_file_paths = [item["path"] for key, item in files_info.items()]
            return [gr_str, output_file_paths]

        except Exception as e:
            print(f"Error: {str(e)}")
        finally:
            self.release_cuda_memory()

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def release_cuda_memory():
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)

    @staticmethod
    def cache_parameters(model_size: str,
                         src_lang: str,
                         tgt_lang: str,
                         max_length: int,
                         add_timestamp: bool):
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["nllb"] = {
            "model_size": model_size,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "max_length": max_length,
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)

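A minimal sketch of translate_file end to end via the NLLB subclass (the input path below is hypothetical):

nllb = NLLBInference()
log, paths = nllb.translate_file(
    fileobjs=["outputs/sample.srt"],  # hypothetical subtitle file
    model_size="facebook/nllb-200-distilled-600M",
    src_lang="Korean", tgt_lang="English",
)
print(paths)  # translated file(s) written under outputs/translations
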
modules/ui/__init__.py
ADDED
File without changes
modules/ui/htmls.py
ADDED
@@ -0,0 +1,97 @@
CSS = """
.bmc-button {
    padding: 2px 5px;
    border-radius: 5px;
    background-color: #FF813F;
    color: white;
    box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
    text-decoration: none;
    display: inline-block;
    font-size: 20px;
    margin: 2px;
    cursor: pointer;
    -webkit-transition: background-color 0.3s ease;
    -ms-transition: background-color 0.3s ease;
    transition: background-color 0.3s ease;
}
.bmc-button:hover,
.bmc-button:active,
.bmc-button:focus {
    background-color: #FF5633;
}
.markdown {
    margin-bottom: 0;
    padding-bottom: 0;
}
.tabs {
    margin-top: 0;
    padding-top: 0;
}

#md_project a {
    color: black;
    text-decoration: none;
}
#md_project a:hover {
    text-decoration: underline;
}
"""

MARKDOWN = """
# Automatic speech recognition
"""


NLLB_VRAM_TABLE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
    table {
        border-collapse: collapse;
        width: 100%;
    }
    th, td {
        border: 1px solid #dddddd;
        text-align: left;
        padding: 8px;
    }
    th {
        background-color: #f2f2f2;
    }
    </style>
</head>
<body>

<details>
    <summary>VRAM usage for each model</summary>
    <table>
        <thead>
            <tr>
                <th>Model name</th>
                <th>Required VRAM</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>nllb-200-3.3B</td>
                <td>~16GB</td>
            </tr>
            <tr>
                <td>nllb-200-1.3B</td>
                <td>~8GB</td>
            </tr>
            <tr>
                <td>nllb-200-distilled-600M</td>
                <td>~4GB</td>
            </tr>
        </tbody>
    </table>
    <p><strong>Note:</strong> Be mindful of your VRAM! The table above provides approximate VRAM usage for each model.</p>
</details>

</body>
</html>
"""

modules/utils/__init__.py
ADDED
File without changes
modules/utils/cli_manager.py
ADDED
@@ -0,0 +1,12 @@
import argparse


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

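str2bool lets the boolean CLI flags in app.py accept the usual textual spellings; a quick sanity check:

print(str2bool("yes"), str2bool("0"))  # True False
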
modules/utils/files_manager.py
ADDED
@@ -0,0 +1,69 @@
import os
import fnmatch
from ruamel.yaml import YAML
from gradio.utils import NamedString

from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH


def load_yaml(path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    yaml = YAML(typ="safe")
    yaml.preserve_quotes = True
    with open(path, 'r', encoding='utf-8') as file:
        config = yaml.load(file)
    return config


def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    yaml = YAML(typ="safe")
    yaml.map_indent = 2
    yaml.sequence_indent = 4
    yaml.sequence_dash_offset = 2
    yaml.preserve_quotes = True
    yaml.default_flow_style = False
    yaml.sort_base_mapping_type_on_output = False

    with open(path, 'w', encoding='utf-8') as file:
        yaml.dump(data, file)
    return path


def get_media_files(folder_path, include_sub_directory=False):
    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
                        '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
    audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
    media_extensions = video_extensions + audio_extensions

    media_files = []

    if include_sub_directory:
        for root, _, files in os.walk(folder_path):
            for extension in media_extensions:
                media_files.extend(
                    os.path.join(root, file) for file in fnmatch.filter(files, extension)
                    if os.path.exists(os.path.join(root, file))
                )
    else:
        for extension in media_extensions:
            media_files.extend(
                os.path.join(folder_path, file) for file in fnmatch.filter(os.listdir(folder_path), extension)
                if os.path.isfile(os.path.join(folder_path, file)) and os.path.exists(os.path.join(folder_path, file))
            )

    return media_files


def format_gradio_files(files: list):
    if not files:
        return files

    gradio_files = []
    for file in files:
        gradio_files.append(NamedString(file))
    return gradio_files


def is_video(file_path):
    video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
    extension = os.path.splitext(file_path)[1].lower()
    return extension in video_extensions

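A minimal round-trip sketch for the YAML helpers above, using their default path (configs/default_parameters.yaml):

params = load_yaml()                  # reads configs/default_parameters.yaml
params["whisper"]["beam_size"] = 10
save_yaml(params)                     # writes the change back in place
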
modules/utils/paths.py
ADDED
@@ -0,0 +1,31 @@
import os

WEBUI_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
MODELS_DIR = os.path.join(WEBUI_DIR, "models")
WHISPER_MODELS_DIR = os.path.join(MODELS_DIR, "Whisper")
FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")

for dir_path in [MODELS_DIR,
                 WHISPER_MODELS_DIR,
                 FASTER_WHISPER_MODELS_DIR,
                 INSANELY_FAST_WHISPER_MODELS_DIR,
                 NLLB_MODELS_DIR,
                 DIARIZATION_MODELS_DIR,
                 UVR_MODELS_DIR,
                 CONFIGS_DIR,
                 OUTPUT_DIR,
                 TRANSLATION_OUTPUT_DIR,
                 UVR_INSTRUMENTAL_OUTPUT_DIR,
                 UVR_VOCALS_OUTPUT_DIR]:
    os.makedirs(dir_path, exist_ok=True)

modules/utils/subtitle_manager.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re


def timeformat_srt(time):
    hours = time // 3600
    minutes = (time - hours * 3600) // 60
    seconds = time - hours * 3600 - minutes * 60
    milliseconds = (time - int(time)) * 1000
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"


def timeformat_vtt(time):
    hours = time // 3600
    minutes = (time - hours * 3600) // 60
    seconds = time - hours * 3600 - minutes * 60
    milliseconds = (time - int(time)) * 1000
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"


def write_file(subtitle, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(subtitle)


def get_srt(segments):
    output = ""
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n\n"
    return output


def get_vtt(segments):
    # The WebVTT spec requires the file to start with the uppercase signature "WEBVTT".
    output = "WEBVTT\n\n"
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n\n"
    return output


def get_txt(segments):
    output = ""
    for i, segment in enumerate(segments):
        if segment['text'].startswith(' '):
            segment['text'] = segment['text'][1:]
        output += f"{segment['text']}\n"
    return output


def parse_srt(file_path):
    """Reads an SRT file and returns it as a list of dicts"""
    with open(file_path, 'r', encoding='utf-8') as file:
        srt_data = file.read()

    data = []
    blocks = srt_data.split('\n\n')

    for block in blocks:
        if block.strip() != '':
            lines = block.strip().split('\n')
            index = lines[0]
            timestamp = lines[1]
            sentence = ' '.join(lines[2:])

            data.append({
                "index": index,
                "timestamp": timestamp,
                "sentence": sentence
            })
    return data


def parse_vtt(file_path):
    """Reads a WebVTT file and returns it as a list of dicts"""
    with open(file_path, 'r', encoding='utf-8') as file:
        webvtt_data = file.read()

    data = []
    blocks = webvtt_data.split('\n\n')

    for block in blocks:
        # Case-insensitive check so both "WEBVTT" and legacy "WebVTT" headers are skipped.
        if block.strip() != '' and not block.strip().upper().startswith("WEBVTT"):
            lines = block.strip().split('\n')
            index = lines[0]
            timestamp = lines[1]
            sentence = ' '.join(lines[2:])

            data.append({
                "index": index,
                "timestamp": timestamp,
                "sentence": sentence
            })

    return data


def get_serialized_srt(dicts):
    output = ""
    for dic in dicts:
        output += f'{dic["index"]}\n'
        output += f'{dic["timestamp"]}\n'
        output += f'{dic["sentence"]}\n\n'
    return output


def get_serialized_vtt(dicts):
    output = "WEBVTT\n\n"
    for dic in dicts:
        output += f'{dic["index"]}\n'
        output += f'{dic["timestamp"]}\n'
        output += f'{dic["sentence"]}\n\n'
    return output


def safe_filename(name):
    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
    # Truncate the filename if it exceeds the max length (20)
    if len(safe_name) > 20:
        file_extension = safe_name.split('.')[-1]
        if len(file_extension) + 1 < 20:
            truncated_name = safe_name[:20 - len(file_extension) - 1]
            safe_name = truncated_name + '.' + file_extension
        else:
            safe_name = safe_name[:20]
    return safe_name
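A short sketch of the writer functions above, using two toy segments in the dict shape they expect (note that get_srt and get_vtt strip one leading space from each segment text in place):

segments = [
    {"start": 0.0, "end": 2.5, "text": " Hello there."},
    {"start": 2.5, "end": 4.0, "text": " General Kenobi."},
]
print(get_srt(segments))
# 1
# 00:00:00,000 --> 00:00:02,500
# Hello there.
# ...
write_file(get_vtt(segments), "example.vtt")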
modules/utils/youtube_manager.py
ADDED
@@ -0,0 +1,33 @@
from pytubefix import YouTube
import subprocess
import os


def get_ytdata(link):
    return YouTube(link)


def get_ytmetas(link):
    yt = YouTube(link)
    return yt.thumbnail_url, yt.title, yt.description


def get_ytaudio(ytdata: YouTube):
    # The downloaded audio can be corrupted, so it is re-encoded into a valid audio file.
    # Fix for: https://github.com/jhj0517/Whisper-WebUI/issues/304
    audio_path = ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
    temp_audio_path = os.path.join("modules", "yt_tmp_fixed.wav")

    try:
        subprocess.run([
            'ffmpeg', '-y',
            '-i', audio_path,
            temp_audio_path
        ], check=True)

        os.replace(temp_audio_path, audio_path)
        return audio_path
    except subprocess.CalledProcessError as e:
        print(f"Error during ffmpeg conversion: {e}")
        return None
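A usage sketch; the link is a placeholder, and get_ytaudio additionally requires an ffmpeg binary on PATH (it returns None if the conversion fails):

link = "https://www.youtube.com/watch?v=VIDEO_ID"   # placeholder id
thumbnail_url, title, description = get_ytmetas(link)
audio_path = get_ytaudio(get_ytdata(link))          # modules/yt_tmp.wav, or None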
modules/uvr/music_separator.py
ADDED
@@ -0,0 +1,183 @@
from typing import Optional, Union, List, Dict
import numpy as np
import torchaudio
import soundfile as sf
import os
import torch
import gc
import gradio as gr
from datetime import datetime

from uvr.models import MDX, Demucs, VrNetwork, MDXC
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
from modules.utils.files_manager import load_yaml, save_yaml, is_video
from modules.diarize.audio_loader import load_audio


class MusicSeparator:
    def __init__(self,
                 model_dir: Optional[str] = None,
                 output_dir: Optional[str] = None):
        self.model = None
        self.device = self.get_device()
        self.available_devices = ["cpu", "cuda"]
        self.model_dir = model_dir
        self.output_dir = output_dir
        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
        vocals_output_dir = os.path.join(self.output_dir, "vocals")
        os.makedirs(instrumental_output_dir, exist_ok=True)
        os.makedirs(vocals_output_dir, exist_ok=True)
        self.audio_info = None
        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
        self.default_model = self.available_models[0]
        self.current_model_size = self.default_model
        self.model_config = {
            "segment": 256,
            "split": True
        }

    def update_model(self,
                     model_name: str = "UVR-MDX-NET-Inst_1",
                     device: Optional[str] = None,
                     segment_size: int = 256):
        """
        Update the model with the given model name.

        Args:
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
        """
        if device is None:
            device = self.device

        self.device = device
        self.model_config = {
            "segment": segment_size,
            "split": True
        }
        self.model = MDX(name=model_name,
                         other_metadata=self.model_config,
                         device=self.device,
                         logger=None,
                         model_dir=self.model_dir)

    def separate(self,
                 audio: Union[str, np.ndarray],
                 model_name: str,
                 device: Optional[str] = None,
                 segment_size: int = 256,
                 save_file: bool = False,
                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
        """
        Separate the background music from the audio.

        Args:
            audio (Union[str, np.ndarray]): Audio path or numpy array.
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
            save_file (bool): Whether to save the separated audio to the output path.
            progress (gr.Progress): Gradio progress indicator.

        Returns:
            A tuple of
            np.ndarray: Instrumental audio as a numpy array.
            np.ndarray: Vocals audio as a numpy array.
            file_paths: List of file paths where the separated audio is saved. Empty when save_file is False.
        """
        if isinstance(audio, str):
            output_filename, ext = os.path.basename(audio), ".wav"
            output_filename, orig_ext = os.path.splitext(output_filename)

            if is_video(audio):
                audio = load_audio(audio)
                sample_rate = 16000
            else:
                self.audio_info = torchaudio.info(audio)
                sample_rate = self.audio_info.sample_rate
        else:
            timestamp = datetime.now().strftime("%m%d%H%M%S")
            output_filename, ext = f"UVR-{timestamp}", ".wav"
            sample_rate = 16000

        model_config = {
            "segment": segment_size,
            "split": True
        }

        if (self.model is None or
                self.current_model_size != model_name or
                self.model_config != model_config or
                self.model.sample_rate != sample_rate or
                self.device != device):
            progress(0, desc="Initializing UVR Model..")
            self.update_model(
                model_name=model_name,
                device=device,
                segment_size=segment_size
            )
            self.model.sample_rate = sample_rate

        progress(0, desc="Separating background music from the audio..")
        result = self.model(audio)
        instrumental, vocals = result["instrumental"].T, result["vocals"].T

        file_paths = []
        if save_file:
            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
            file_paths += [instrumental_output_path, vocals_output_path]

        return instrumental, vocals, file_paths

    def separate_files(self,
                       files: List,
                       model_name: str,
                       device: Optional[str] = None,
                       segment_size: int = 256,
                       save_file: bool = True,
                       progress: gr.Progress = gr.Progress()) -> List[str]:
        """Separate the background music from the audio files. Returns only the last
        instrumental and vocals file paths, to display in gr.Audio()."""
        self.cache_parameters(model_size=model_name, segment_size=segment_size)

        for file_path in files:
            instrumental, vocals, file_paths = self.separate(
                audio=file_path,
                model_name=model_name,
                device=device,
                segment_size=segment_size,
                save_file=save_file,
                progress=progress
            )
        return file_paths

    @staticmethod
    def get_device():
        """Get the device for the model"""
        return "cuda" if torch.cuda.is_available() else "cpu"

    def offload(self):
        """Offload the model and free up the memory"""
        if self.model is not None:
            del self.model
            self.model = None
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        self.audio_info = None

    @staticmethod
    def cache_parameters(model_size: str,
                         segment_size: int):
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_uvr_params = cached_params["bgm_separation"]
        uvr_params_to_cache = {
            "model_size": model_size,
            "segment_size": segment_size
        }
        cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
        cached_params["bgm_separation"] = cached_uvr_params
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
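A minimal sketch of the separator, assuming the UVR model weights can be fetched into the model directory; "song.wav" is a hypothetical input file, and in the app the gr.Progress default is supplied by a Gradio event handler:

from modules.utils.paths import UVR_MODELS_DIR, UVR_OUTPUT_DIR

separator = MusicSeparator(model_dir=UVR_MODELS_DIR, output_dir=UVR_OUTPUT_DIR)
instrumental, vocals, paths = separator.separate(
    audio="song.wav",                    # hypothetical input file
    model_name=separator.default_model,  # "UVR-MDX-NET-Inst_HQ_4"
    segment_size=256,
    save_file=True,                      # writes into outputs/UVR/{instrumental,vocals}
)
separator.offload()                      # release VRAM when done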
modules/vad/__init__.py
ADDED
File without changes
modules/vad/silero_vad.py
ADDED
@@ -0,0 +1,264 @@
# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py

from faster_whisper.vad import VadOptions, get_vad_model
import numpy as np
from typing import BinaryIO, Union, List, Optional, Tuple
import warnings
import faster_whisper
from faster_whisper.transcribe import SpeechTimestampsMap, Segment
import gradio as gr


class SileroVAD:
    def __init__(self):
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path, file binary, or audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speech, used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
            self,
            audio: np.ndarray,
            vad_options: Optional[VadOptions] = None,
            progress: gr.Progress = gr.Progress(),
            **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        sampling_rate = 16000
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample / audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
            seconds: float,
            always_include_hours: bool = False,
            decimal_marker: str = ".",
    ) -> str:
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
            self,
            segments: List[dict],
            speech_chunks: List[dict],
            sampling_rate: Optional[int] = None,
    ) -> List[dict]:
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment["start"] = ts_map.get_original_time(segment["start"])
            segment["end"] = ts_map.get_original_time(segment["end"])

        return segments
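A sketch of the intended round trip: trim non-speech before transcription, then map the segment times back onto the original timeline. The input file and the placeholder segment are hypothetical:

from faster_whisper.vad import VadOptions

vad = SileroVAD()
trimmed, speech_chunks = vad.run("input.wav", VadOptions(threshold=0.5))
# ... transcribe `trimmed` elsewhere, producing segments with start/end in seconds ...
segments = [{"start": 0.0, "end": 1.2, "text": "hi"}]   # placeholder result
segments = vad.restore_speech_timestamps(segments, speech_chunks)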
modules/whisper/__init__.py
ADDED
File without changes
modules/whisper/faster_whisper_inference.py
ADDED
@@ -0,0 +1,192 @@
import os
import time
import numpy as np
import torch
from typing import BinaryIO, Union, Tuple, List
import faster_whisper
from faster_whisper.vad import VadOptions
import ast
import ctranslate2
import whisper
import gradio as gr
from argparse import Namespace

from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
from modules.whisper.whisper_parameter import *
from modules.whisper.whisper_base import WhisperBase


class FasterWhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = FASTER_WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir,
            output_dir=output_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        self.model_paths = self.get_model_paths()
        self.device = self.get_device()
        self.available_models = self.model_paths.keys()
        self.available_compute_types = ctranslate2.get_supported_compute_types(
            "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")

    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for faster-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path, file binary, or audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start and end timestamps and the transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()

        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        # Empty strings from gr.Textbox() are converted to None: https://github.com/gradio-app/gradio/issues/8723
        if not params.initial_prompt:
            params.initial_prompt = None
        if not params.prefix:
            params.prefix = None
        if not params.hotwords:
            params.hotwords = None

        params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)

        segments, info = self.model.transcribe(
            audio=audio,
            language=params.lang,
            task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
            beam_size=params.beam_size,
            log_prob_threshold=params.log_prob_threshold,
            no_speech_threshold=params.no_speech_threshold,
            best_of=params.best_of,
            patience=params.patience,
            temperature=params.temperature,
            initial_prompt=params.initial_prompt,
            compression_ratio_threshold=params.compression_ratio_threshold,
            length_penalty=params.length_penalty,
            repetition_penalty=params.repetition_penalty,
            no_repeat_ngram_size=params.no_repeat_ngram_size,
            prefix=params.prefix,
            suppress_blank=params.suppress_blank,
            suppress_tokens=params.suppress_tokens,
            max_initial_timestamp=params.max_initial_timestamp,
            word_timestamps=params.word_timestamps,
            prepend_punctuations=params.prepend_punctuations,
            append_punctuations=params.append_punctuations,
            max_new_tokens=params.max_new_tokens,
            chunk_length=params.chunk_length,
            hallucination_silence_threshold=params.hallucination_silence_threshold,
            hotwords=params.hotwords,
            language_detection_threshold=params.language_detection_threshold,
            language_detection_segments=params.language_detection_segments,
            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
        )
        progress(0, desc="Loading audio..")

        segments_result = []
        for segment in segments:
            progress(segment.start / info.duration, desc="Transcribing..")
            segments_result.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            See more info: https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_model_size = self.model_paths[model_size]
        self.current_compute_type = compute_type
        self.model = faster_whisper.WhisperModel(
            device=self.device,
            model_size_or_path=self.current_model_size,
            download_root=self.model_dir,
            compute_type=self.current_compute_type
        )

    def get_model_paths(self):
        """
        Get available models from the models path, including fine-tuned models.

        Returns
        ----------
        Dictionary of model names mapped to their paths
        """
        model_paths = {model: model for model in faster_whisper.available_models()}
        faster_whisper_prefix = "models--Systran--faster-whisper-"

        existing_models = os.listdir(self.model_dir)
        wrong_dirs = [".locks"]
        existing_models = list(set(existing_models) - set(wrong_dirs))

        for model_name in existing_models:
            if faster_whisper_prefix in model_name:
                model_name = model_name[len(faster_whisper_prefix):]

            if model_name not in whisper.available_models():
                model_paths[model_name] = os.path.join(self.model_dir, model_name)
        return model_paths

    @staticmethod
    def get_device():
        if torch.cuda.is_available():
            return "cuda"
        else:
            return "auto"

    @staticmethod
    def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
        try:
            suppress_tokens = ast.literal_eval(suppress_tokens_str)
            if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
                raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
            return suppress_tokens
        except Exception:
            raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
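The suppress-tokens value arrives from a Gradio textbox as a string, so it is validated with ast.literal_eval; for example:

FasterWhisperInference.format_suppress_tokens_str("[-1]")         # -> [-1]
FasterWhisperInference.format_suppress_tokens_str("[-1, 50257]")  # -> [-1, 50257]
FasterWhisperInference.format_suppress_tokens_str("oops")         # raises ValueError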
modules/whisper/insanely_fast_whisper_inference.py
ADDED
@@ -0,0 +1,195 @@
import os
import time
import numpy as np
from typing import BinaryIO, Union, Tuple, List
import torch
from transformers import pipeline
from transformers.utils import is_flash_attn_2_available
import gradio as gr
from huggingface_hub import hf_hub_download
import whisper
from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
from argparse import Namespace

from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
from modules.whisper.whisper_parameter import *
from modules.whisper.whisper_base import WhisperBase


class InsanelyFastWhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        self.available_models = openai_models + distil_models
        self.available_compute_types = ["float16"]

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, np.ndarray, torch.Tensor]
            Audio path, audio numpy array, or torch Tensor
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start and end timestamps and the transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
        # rich progress bar (this shadows the gradio `progress` inside the block)
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as progress:
            progress.add_task("[yellow]Transcribing...", total=None)

            kwargs = {
                "no_speech_threshold": params.no_speech_threshold,
                "temperature": params.temperature,
                "compression_ratio_threshold": params.compression_ratio_threshold,
                "logprob_threshold": params.log_prob_threshold,
            }

            # English-only models (".en") do not accept language/task generate kwargs
            if not self.current_model_size.endswith(".en"):
                kwargs["language"] = params.lang
                kwargs["task"] = "translate" if params.is_translate else "transcribe"

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length,
                batch_size=params.batch_size,
                generate_kwargs=kwargs
            )

        segments_result = self.format_result(
            transcribed_result=segments,
        )
        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            See more info: https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=os.path.join(self.model_dir, model_size),
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    @staticmethod
    def format_result(
            transcribed_result: dict
    ) -> List[dict]:
        """
        Format the transcription result of insanely_fast_whisper to match the other implementations.

        Parameters
        ----------
        transcribed_result: dict
            Transcription result of insanely_fast_whisper

        Returns
        ----------
        result: List[dict]
            Result formatted to match the other implementations
        """
        result = transcribed_result["chunks"]
        for item in result:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                end = start
            item["start"] = start
            item["end"] = end
        return result

    @staticmethod
    def download_model(
            model_size: str,
            download_root: str,
            progress: gr.Progress
    ):
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
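The transformers pipeline returns chunks keyed by "timestamp" tuples; format_result copies them into the start/end keys used by the rest of the code (an open-ended final chunk gets end = start). A toy example:

chunks = {"chunks": [{"timestamp": (0.0, 3.2), "text": " hello"},
                     {"timestamp": (3.2, None), "text": " world"}]}
result = InsanelyFastWhisperInference.format_result(chunks)
# result[0] -> {..., "start": 0.0, "end": 3.2}
# result[1] -> {..., "start": 3.2, "end": 3.2}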
modules/whisper/whisper_Inference.py
ADDED
@@ -0,0 +1,104 @@
import whisper
import gradio as gr
import time
from typing import BinaryIO, Union, Tuple, List
import numpy as np
import torch
import os
from argparse import Namespace

from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
from modules.whisper.whisper_base import WhisperBase
from modules.whisper.whisper_parameter import *


class WhisperInference(WhisperBase):
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for the original openai/whisper implementation.

        Parameters
        ----------
        audio: Union[str, np.ndarray, torch.Tensor]
            Audio path, audio numpy array, or torch Tensor
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start and end timestamps and the transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParameters.as_value(*whisper_params)

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        def progress_callback(progress_value):
            progress(progress_value, desc="Transcribing..")

        segments_result = self.model.transcribe(audio=audio,
                                                language=params.lang,
                                                verbose=False,
                                                beam_size=params.beam_size,
                                                logprob_threshold=params.log_prob_threshold,
                                                no_speech_threshold=params.no_speech_threshold,
                                                task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                                                fp16=params.compute_type == "float16",
                                                best_of=params.best_of,
                                                patience=params.patience,
                                                temperature=params.temperature,
                                                compression_ratio_threshold=params.compression_ratio_threshold,
                                                progress_callback=progress_callback,)["segments"]
        elapsed_time = time.time() - start_time

        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            See more info: https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = whisper.load_model(
            name=model_size,
            device=self.device,
            download_root=self.model_dir
        )
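Note that the progress_callback keyword above is not part of upstream openai-whisper's transcribe(); it presumably relies on the patched whisper build this project installs. A minimal sketch of model selection ("tiny" is used only to keep the download small; constructing the class also initializes the diarization, VAD, and UVR helpers):

inferencer = WhisperInference()
inferencer.update_model(model_size="tiny", compute_type="float32")
print(inferencer.current_model_size)   # -> "tiny"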
modules/whisper/whisper_base.py
ADDED
@@ -0,0 +1,542 @@
import os
import torch
import whisper
import gradio as gr
import torchaudio
from abc import ABC, abstractmethod
from typing import BinaryIO, Union, Tuple, List
import numpy as np
from datetime import datetime
from faster_whisper.vad import VadOptions
from dataclasses import astuple

from modules.uvr.music_separator import MusicSeparator
from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
                                 UVR_MODELS_DIR)
from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
from modules.utils.youtube_manager import get_ytdata, get_ytaudio
from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
from modules.whisper.whisper_parameter import *
from modules.diarize.diarizer import Diarizer
from modules.vad.silero_vad import SileroVAD


class WhisperBase(ABC):
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)
        self.diarizer = Diarizer(
            model_dir=diarization_model_dir
        )
        self.vad = SileroVAD()
        self.music_separator = MusicSeparator(
            model_dir=uvr_model_dir,
            output_dir=os.path.join(output_dir, "UVR")
        )

        self.model = None
        self.current_model_size = None
        self.available_models = whisper.available_models()
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
        self.device = self.get_device()
        self.available_compute_types = ["float16", "float32"]
        self.current_compute_type = "float16" if self.device == "cuda" else "float32"

    @abstractmethod
    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ):
        """Run inference with the Whisper model to transcribe"""
        pass

    @abstractmethod
    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """Initialize the Whisper model"""
        pass

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            progress: gr.Progress = gr.Progress(),
            add_timestamp: bool = True,
            *whisper_params,
            ) -> Tuple[List[dict], float]:
        """
        Run transcription with conditional pre-processing and post-processing.
        VAD is performed in pre-processing to remove noise from the audio input, if enabled.
        Diarization is performed in post-processing, if enabled.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be a file path or binary type.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        add_timestamp: bool
            Whether to add a timestamp at the end of the filename.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start and end timestamps and the transcribed text
        elapsed_time: float
            elapsed time for running
        """
        params = WhisperParameters.as_value(*whisper_params)

        self.cache_parameters(
            whisper_params=params,
            add_timestamp=add_timestamp
        )

        if params.lang is None:
            pass
        elif params.lang == "Automatic Detection":
            params.lang = None
        else:
            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
            params.lang = language_code_dict[params.lang]

        if params.is_bgm_separate:
            music, audio, _ = self.music_separator.separate(
                audio=audio,
                model_name=params.uvr_model_size,
                device=params.uvr_device,
                segment_size=params.uvr_segment_size,
                save_file=params.uvr_save_file,
                progress=progress
            )

            if audio.ndim >= 2:
                audio = audio.mean(axis=1)
                if self.music_separator.audio_info is None:
                    origin_sample_rate = 16000
                else:
                    origin_sample_rate = self.music_separator.audio_info.sample_rate
                audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)

            if params.uvr_enable_offload:
                self.music_separator.offload()

        if params.vad_filter:
            # Explicit value set for float('inf') from gr.Number()
            if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
                params.max_speech_duration_s = float('inf')

            vad_options = VadOptions(
                threshold=params.threshold,
                min_speech_duration_ms=params.min_speech_duration_ms,
                max_speech_duration_s=params.max_speech_duration_s,
                min_silence_duration_ms=params.min_silence_duration_ms,
                speech_pad_ms=params.speech_pad_ms
            )

            audio, speech_chunks = self.vad.run(
                audio=audio,
                vad_parameters=vad_options,
                progress=progress
            )

        result, elapsed_time = self.transcribe(
            audio,
            progress,
            *astuple(params)
        )

        if params.vad_filter:
            result = self.vad.restore_speech_timestamps(
                segments=result,
                speech_chunks=speech_chunks,
            )

        if params.is_diarize:
            result, elapsed_time_diarization = self.diarizer.run(
                audio=audio,
                use_auth_token=params.hf_token,
                transcribed_result=result,
            )
            elapsed_time += elapsed_time_diarization
        return result, elapsed_time

    def transcribe_file(self,
                        files: Optional[List] = None,
                        input_folder_path: Optional[str] = None,
                        file_format: str = "SRT",
                        add_timestamp: bool = True,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
        """
        Write subtitle files from input files

        Parameters
        ----------
        files: list
            List of files to transcribe from gr.Files()
        input_folder_path: str
            Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
            this will be used instead.
        file_format: str
            Subtitle file format to write from gr.Dropdown(). Supported formats: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            if input_folder_path:
                files = get_media_files(input_folder_path)
            if isinstance(files, str):
                files = [files]
            if files and isinstance(files[0], gr.utils.NamedString):
                files = [file.name for file in files]

            files_info = {}
            for file in files:

                ## Detect language; note this loads a separate whisper model per file just for detection
                #model = whisper.load_model("base")
                params = WhisperParameters.as_value(*whisper_params)
                model = whisper.load_model(params.model_size)
                mel = whisper.log_mel_spectrogram(whisper.pad_or_trim(whisper.load_audio(file))).to(model.device)
                _, probs = model.detect_language(mel)
                file_language = "not"  # fallback makes the summary read "Language not detected"
                for key, value in whisper.tokenizer.LANGUAGES.items():
                    if key == str(max(probs, key=probs.get)):
                        file_language = value.capitalize()
                        break

                transcribed_segments, time_for_task = self.run(
                    file,
                    progress,
                    add_timestamp,
                    *whisper_params,
                )

                file_name, file_ext = os.path.splitext(os.path.basename(file))
                subtitle, file_path = self.generate_and_write_file(
                    file_name=file_name,
                    transcribed_segments=transcribed_segments,
                    add_timestamp=add_timestamp,
                    file_format=file_format,
                    output_dir=self.output_dir
                )

                # "ext" is stored per file so the summary below does not reuse the last loop's extension
                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path,
                                         "lang": file_language, "ext": file_ext}

            total_result = ''
            total_info = ''
            total_time = 0
            for file_name, info in files_info.items():
                total_result += f'{info["subtitle"]}'
                total_time += info["time_for_task"]
                #total_info += f'{info["lang"]}'
                total_info += f"Language {info['lang']} detected for file '{file_name}{info['ext']}'\n"

            #result_str = f"Processing of file '{file_name}{file_ext}' done in {self.format_time(total_time)}:\n\n{total_result}"
            total_info += f"Transcription process done in {self.format_time(total_time)}"
            result_str = total_result
            result_file_path = [info['path'] for info in files_info.values()]

            return [result_str, result_file_path, total_info]

        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()

    def transcribe_mic(self,
                       mic_audio: str,
                       file_format: str = "SRT",
                       add_timestamp: bool = True,
                       progress=gr.Progress(),
                       *whisper_params,
                       ) -> list:
        """
        Write subtitle file from microphone input

        Parameters
        ----------
        mic_audio: str
            Audio file path from gr.Microphone()
        file_format: str
            Subtitle file format to write from gr.Dropdown(). Supported formats: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related to Whisper, handled by the WhisperParameters data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            progress(0, desc="Loading Audio..")
            transcribed_segments, time_for_task = self.run(
                mic_audio,
                progress,
                add_timestamp,
                *whisper_params,
            )
            progress(1, desc="Completed!")

            subtitle, result_file_path = self.generate_and_write_file(
                file_name="Mic",
                transcribed_segments=transcribed_segments,
                add_timestamp=add_timestamp,
                file_format=file_format,
                output_dir=self.output_dir
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
            return [result_str, result_file_path]
        except Exception as e:
            print(f"Error transcribing file: {e}")
        finally:
            self.release_cuda_memory()

    def transcribe_youtube(self,
                           youtube_link: str,
                           file_format: str = "SRT",
                           add_timestamp: bool = True,
                           progress=gr.Progress(),
                           *whisper_params,
                           ) -> list:
|
334 |
+
"""
|
335 |
+
Write subtitle file from Youtube
|
336 |
+
|
337 |
+
Parameters
|
338 |
+
----------
|
339 |
+
youtube_link: str
|
340 |
+
URL of the Youtube video to transcribe from gr.Textbox()
|
341 |
+
file_format: str
|
342 |
+
Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
|
343 |
+
add_timestamp: bool
|
344 |
+
Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
|
345 |
+
progress: gr.Progress
|
346 |
+
Indicator to show progress directly in gradio.
|
347 |
+
*whisper_params: tuple
|
348 |
+
Parameters related with whisper. This will be dealt with "WhisperParameters" data class
|
349 |
+
|
350 |
+
Returns
|
351 |
+
----------
|
352 |
+
result_str:
|
353 |
+
Result of transcription to return to gr.Textbox()
|
354 |
+
result_file_path:
|
355 |
+
Output file path to return to gr.Files()
|
356 |
+
"""
|
357 |
+
try:
|
358 |
+
progress(0, desc="Loading Audio from Youtube..")
|
359 |
+
yt = get_ytdata(youtube_link)
|
360 |
+
audio = get_ytaudio(yt)
|
361 |
+
|
362 |
+
transcribed_segments, time_for_task = self.run(
|
363 |
+
audio,
|
364 |
+
progress,
|
365 |
+
add_timestamp,
|
366 |
+
*whisper_params,
|
367 |
+
)
|
368 |
+
|
369 |
+
progress(1, desc="Completed!")
|
370 |
+
|
371 |
+
file_name = safe_filename(yt.title)
|
372 |
+
subtitle, result_file_path = self.generate_and_write_file(
|
373 |
+
file_name=file_name,
|
374 |
+
transcribed_segments=transcribed_segments,
|
375 |
+
add_timestamp=add_timestamp,
|
376 |
+
file_format=file_format,
|
377 |
+
output_dir=self.output_dir
|
378 |
+
)
|
379 |
+
result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
|
380 |
+
|
381 |
+
if os.path.exists(audio):
|
382 |
+
os.remove(audio)
|
383 |
+
|
384 |
+
return [result_str, result_file_path]
|
385 |
+
|
386 |
+
except Exception as e:
|
387 |
+
print(f"Error transcribing file: {e}")
|
388 |
+
finally:
|
389 |
+
self.release_cuda_memory()
|
390 |
+
|
391 |
+
@staticmethod
|
392 |
+
def generate_and_write_file(file_name: str,
|
393 |
+
transcribed_segments: list,
|
394 |
+
add_timestamp: bool,
|
395 |
+
file_format: str,
|
396 |
+
output_dir: str
|
397 |
+
) -> str:
|
398 |
+
"""
|
399 |
+
Writes subtitle file
|
400 |
+
|
401 |
+
Parameters
|
402 |
+
----------
|
403 |
+
file_name: str
|
404 |
+
Output file name
|
405 |
+
transcribed_segments: list
|
406 |
+
Text segments transcribed from audio
|
407 |
+
add_timestamp: bool
|
408 |
+
Determines whether to add a timestamp to the end of the filename.
|
409 |
+
file_format: str
|
410 |
+
File format to write. Supported formats: [SRT, WebVTT, txt]
|
411 |
+
output_dir: str
|
412 |
+
Directory path of the output
|
413 |
+
|
414 |
+
Returns
|
415 |
+
----------
|
416 |
+
content: str
|
417 |
+
Result of the transcription
|
418 |
+
output_path: str
|
419 |
+
output file path
|
420 |
+
"""
|
421 |
+
if add_timestamp:
|
422 |
+
timestamp = datetime.now().strftime("%m%d%H%M%S")
|
423 |
+
output_path = os.path.join(output_dir, f"{file_name} - {timestamp}")
|
424 |
+
else:
|
425 |
+
output_path = os.path.join(output_dir, f"{file_name}")
|
426 |
+
|
427 |
+
file_format = file_format.strip().lower()
|
428 |
+
if file_format == "srt":
|
429 |
+
content = get_srt(transcribed_segments)
|
430 |
+
output_path += '.srt'
|
431 |
+
|
432 |
+
elif file_format == "webvtt":
|
433 |
+
content = get_vtt(transcribed_segments)
|
434 |
+
output_path += '.vtt'
|
435 |
+
|
436 |
+
elif file_format == "txt":
|
437 |
+
content = get_txt(transcribed_segments)
|
438 |
+
output_path += '.txt'
|
439 |
+
|
440 |
+
write_file(content, output_path)
|
441 |
+
return content, output_path
|
442 |
+
|
443 |
+
@staticmethod
|
444 |
+
def format_time(elapsed_time: float) -> str:
|
445 |
+
"""
|
446 |
+
Get {hours} {minutes} {seconds} time format string
|
447 |
+
|
448 |
+
Parameters
|
449 |
+
----------
|
450 |
+
elapsed_time: str
|
451 |
+
Elapsed time for transcription
|
452 |
+
|
453 |
+
Returns
|
454 |
+
----------
|
455 |
+
Time format string
|
456 |
+
"""
|
457 |
+
hours, rem = divmod(elapsed_time, 3600)
|
458 |
+
minutes, seconds = divmod(rem, 60)
|
459 |
+
|
460 |
+
time_str = ""
|
461 |
+
if hours:
|
462 |
+
time_str += f"{hours} hours "
|
463 |
+
if minutes:
|
464 |
+
time_str += f"{minutes} minutes "
|
465 |
+
seconds = round(seconds)
|
466 |
+
time_str += f"{seconds} seconds"
|
467 |
+
|
468 |
+
return time_str.strip()
|
469 |
+
|
470 |
+
@staticmethod
|
471 |
+
def get_device():
|
472 |
+
if torch.cuda.is_available():
|
473 |
+
return "cuda"
|
474 |
+
elif torch.backends.mps.is_available():
|
475 |
+
if not WhisperBase.is_sparse_api_supported():
|
476 |
+
# Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
|
477 |
+
return "cpu"
|
478 |
+
return "mps"
|
479 |
+
else:
|
480 |
+
return "cpu"
|
481 |
+
|
482 |
+
@staticmethod
|
483 |
+
def is_sparse_api_supported():
|
484 |
+
if not torch.backends.mps.is_available():
|
485 |
+
return False
|
486 |
+
|
487 |
+
try:
|
488 |
+
device = torch.device("mps")
|
489 |
+
sparse_tensor = torch.sparse_coo_tensor(
|
490 |
+
indices=torch.tensor([[0, 1], [2, 3]]),
|
491 |
+
values=torch.tensor([1, 2]),
|
492 |
+
size=(4, 4),
|
493 |
+
device=device
|
494 |
+
)
|
495 |
+
return True
|
496 |
+
except RuntimeError:
|
497 |
+
return False
|
498 |
+
|
499 |
+
@staticmethod
|
500 |
+
def release_cuda_memory():
|
501 |
+
"""Release memory"""
|
502 |
+
if torch.cuda.is_available():
|
503 |
+
torch.cuda.empty_cache()
|
504 |
+
torch.cuda.reset_max_memory_allocated()
|
505 |
+
|
506 |
+
@staticmethod
|
507 |
+
def remove_input_files(file_paths: List[str]):
|
508 |
+
"""Remove gradio cached files"""
|
509 |
+
if not file_paths:
|
510 |
+
return
|
511 |
+
|
512 |
+
for file_path in file_paths:
|
513 |
+
if file_path and os.path.exists(file_path):
|
514 |
+
os.remove(file_path)
|
515 |
+
|
516 |
+
@staticmethod
|
517 |
+
def cache_parameters(
|
518 |
+
whisper_params: WhisperValues,
|
519 |
+
add_timestamp: bool
|
520 |
+
):
|
521 |
+
"""cache parameters to the yaml file"""
|
522 |
+
cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
|
523 |
+
cached_whisper_param = whisper_params.to_yaml()
|
524 |
+
cached_yaml = {**cached_params, **cached_whisper_param}
|
525 |
+
cached_yaml["whisper"]["add_timestamp"] = add_timestamp
|
526 |
+
|
527 |
+
save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
|
528 |
+
|
529 |
+
@staticmethod
|
530 |
+
def resample_audio(audio: Union[str, np.ndarray],
|
531 |
+
new_sample_rate: int = 16000,
|
532 |
+
original_sample_rate: Optional[int] = None,) -> np.ndarray:
|
533 |
+
"""Resamples audio to 16k sample rate, standard on Whisper model"""
|
534 |
+
if isinstance(audio, str):
|
535 |
+
audio, original_sample_rate = torchaudio.load(audio)
|
536 |
+
else:
|
537 |
+
if original_sample_rate is None:
|
538 |
+
raise ValueError("original_sample_rate must be provided when audio is numpy array.")
|
539 |
+
audio = torch.from_numpy(audio)
|
540 |
+
resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
|
541 |
+
resampled_audio = resampler(audio).numpy()
|
542 |
+
return resampled_audio
|
modules/whisper/whisper_factory.py
ADDED
@@ -0,0 +1,90 @@
from typing import Optional
import os

from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
from modules.whisper.faster_whisper_inference import FasterWhisperInference
from modules.whisper.whisper_Inference import WhisperInference
from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
from modules.whisper.whisper_base import WhisperBase


class WhisperFactory:
    @staticmethod
    def create_whisper_inference(
        whisper_type: str,
        whisper_model_dir: str = WHISPER_MODELS_DIR,
        faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
        insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
        diarization_model_dir: str = DIARIZATION_MODELS_DIR,
        uvr_model_dir: str = UVR_MODELS_DIR,
        output_dir: str = OUTPUT_DIR,
    ) -> "WhisperBase":
        """
        Create a whisper inference class based on the provided whisper_type.

        Parameters
        ----------
        whisper_type : str
            The type of Whisper implementation to use. Supported values (case-insensitive):
            - "faster-whisper": https://github.com/SYSTRAN/faster-whisper
            - "whisper": https://github.com/openai/whisper
            - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
        whisper_model_dir : str
            Directory path for the Whisper model.
        faster_whisper_model_dir : str
            Directory path for the Faster Whisper model.
        insanely_fast_whisper_model_dir : str
            Directory path for the Insanely Fast Whisper model.
        diarization_model_dir : str
            Directory path for the diarization model.
        uvr_model_dir : str
            Directory path for the UVR model.
        output_dir : str
            Directory path where output files will be saved.

        Returns
        -------
        WhisperBase
            An instance of the appropriate whisper inference class based on the whisper_type.
        """
        # Temporary fix for the bug: https://github.com/jhj0517/Whisper-WebUI/issues/144
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

        whisper_type = whisper_type.lower().strip()

        # Accepted aliases (including common misspellings) for each implementation.
        faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
        whisper_typos = ["whisper"]
        insanely_fast_whisper_typos = [
            "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
            "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
        ]

        if whisper_type in faster_whisper_typos:
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        elif whisper_type in whisper_typos:
            return WhisperInference(
                model_dir=whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        elif whisper_type in insanely_fast_whisper_typos:
            return InsanelyFastWhisperInference(
                model_dir=insanely_fast_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        else:
            # Unrecognized types fall back to faster-whisper.
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
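A minimal usage sketch of the factory; the alias value is illustrative, and the model/output directories fall back to the defaults above:

from modules.whisper.whisper_factory import WhisperFactory

# Any accepted alias works, regardless of case; unknown values fall back to faster-whisper.
inferencer = WhisperFactory.create_whisper_inference(whisper_type="Faster-Whisper")
print(type(inferencer).__name__)  # FasterWhisperInference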
modules/whisper/whisper_parameter.py
ADDED
@@ -0,0 +1,369 @@
from dataclasses import dataclass, fields
import gradio as gr
from typing import Optional, Dict
import yaml


@dataclass
class WhisperParameters:
    """
    A data class for the Gradio components of the Whisper parameters. Use it *before* Gradio pre-processing.
    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
    See more about Gradio pre-processing: https://www.gradio.app/docs/components

    Attributes
    ----------
    model_size: gr.Dropdown
        Whisper model size.

    lang: gr.Dropdown
        Source language of the file to transcribe.

    is_translate: gr.Checkbox
        Boolean value that determines whether to translate to English.
        It's Whisper's feature to translate speech from another language directly into English end-to-end.

    beam_size: gr.Number
        Integer value used as the beam size for decoding.

    log_prob_threshold: gr.Number
        If the average log probability over sampled tokens is below this value, treat as failed.

    no_speech_threshold: gr.Number
        If the no_speech probability is higher than this value AND
        the average log probability over sampled tokens is below `log_prob_threshold`,
        consider the segment as silent.

    compute_type: gr.Dropdown
        Compute type for transcription.
        See more info: https://opennmt.net/CTranslate2/quantization.html

    best_of: gr.Number
        Number of candidates when sampling with non-zero temperature.

    patience: gr.Number
        Beam search patience factor.

    condition_on_previous_text: gr.Checkbox
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    prompt_reset_on_temperature: gr.Slider
        This parameter is related to faster-whisper. Resets the prompt if the temperature is above this value.

    initial_prompt: gr.Textbox
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those words correctly.

    temperature: gr.Slider
        Temperature for sampling. It can be a tuple of temperatures,
        which will be successively used upon failures according to either
        `compression_ratio_threshold` or `log_prob_threshold`.

    compression_ratio_threshold: gr.Number
        If the gzip compression ratio is above this value, treat as failed.

    vad_filter: gr.Checkbox
        Enable the voice activity detection (VAD) to filter out parts of the audio
        without speech. This step uses the Silero VAD model:
        https://github.com/snakers4/silero-vad.

    threshold: gr.Slider
        This parameter is related to Silero VAD. Speech threshold.
        Silero VAD outputs speech probabilities for each audio chunk;
        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
        parameter for each dataset separately, but a "lazy" 0.5 is pretty good for most datasets.

    min_speech_duration_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.

    max_speech_duration_s: gr.Number
        This parameter is related to Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
        than max_speech_duration_s will be split at the timestamp of the last silence that
        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
        split aggressively just before max_speech_duration_s.

    min_silence_duration_ms: gr.Number
        This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
        before separating it.

    speech_pad_ms: gr.Number
        This parameter is related to Silero VAD. Final speech chunks are padded by speech_pad_ms on each side.

    batch_size: gr.Number
        This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe.

    is_diarize: gr.Checkbox
        This parameter is related to whisperx. Boolean value that determines whether to diarize or not.

    hf_token: gr.Textbox
        This parameter is related to whisperx. A Hugging Face token is needed to download diarization models.
        Read more at: https://huggingface.co/pyannote/speaker-diarization-3.1#requirements

    diarization_device: gr.Dropdown
        This parameter is related to whisperx. Device to run the diarization model on.

    length_penalty: gr.Number
        This parameter is related to faster-whisper. Exponential length penalty constant.

    repetition_penalty: gr.Number
        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
        (set > 1 to penalize).

    no_repeat_ngram_size: gr.Number
        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).

    prefix: gr.Textbox
        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.

    suppress_blank: gr.Checkbox
        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.

    suppress_tokens: gr.Textbox
        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
        of symbols as defined in the model config.json file.

    max_initial_timestamp: gr.Number
        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.

    word_timestamps: gr.Checkbox
        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
        and dynamic time warping, and include the timestamps for each word in each segment.

    prepend_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the next word.

    append_punctuations: gr.Textbox
        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
        with the previous word.

    max_new_tokens: gr.Number
        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
        the maximum will be set by the default max_length.

    chunk_length: gr.Number
        This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
        If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.

    hallucination_silence_threshold: gr.Number
        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
        (in seconds) when a possible hallucination is detected.

    hotwords: gr.Textbox
        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.

    language_detection_threshold: gr.Number
        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.

    language_detection_segments: gr.Number
        This parameter is related to faster-whisper. Number of segments to consider for the language detection.

    is_bgm_separate: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.

    uvr_model_size: gr.Dropdown
        This parameter is related to UVR. UVR model size.

    uvr_device: gr.Dropdown
        This parameter is related to UVR. Device to run the UVR model on.

    uvr_segment_size: gr.Number
        This parameter is related to UVR. Segment size for the UVR model.

    uvr_save_file: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to save the separated file or not.

    uvr_enable_offload: gr.Checkbox
        This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
        after each transcription.
    """
    model_size: gr.Dropdown
    lang: gr.Dropdown
    is_translate: gr.Checkbox
    beam_size: gr.Number
    log_prob_threshold: gr.Number
    no_speech_threshold: gr.Number
    compute_type: gr.Dropdown
    best_of: gr.Number
    patience: gr.Number
    condition_on_previous_text: gr.Checkbox
    prompt_reset_on_temperature: gr.Slider
    initial_prompt: gr.Textbox
    temperature: gr.Slider
    compression_ratio_threshold: gr.Number
    vad_filter: gr.Checkbox
    threshold: gr.Slider
    min_speech_duration_ms: gr.Number
    max_speech_duration_s: gr.Number
    min_silence_duration_ms: gr.Number
    speech_pad_ms: gr.Number
    batch_size: gr.Number
    is_diarize: gr.Checkbox
    hf_token: gr.Textbox
    diarization_device: gr.Dropdown
    length_penalty: gr.Number
    repetition_penalty: gr.Number
    no_repeat_ngram_size: gr.Number
    prefix: gr.Textbox
    suppress_blank: gr.Checkbox
    suppress_tokens: gr.Textbox
    max_initial_timestamp: gr.Number
    word_timestamps: gr.Checkbox
    prepend_punctuations: gr.Textbox
    append_punctuations: gr.Textbox
    max_new_tokens: gr.Number
    chunk_length: gr.Number
    hallucination_silence_threshold: gr.Number
    hotwords: gr.Textbox
    language_detection_threshold: gr.Number
    language_detection_segments: gr.Number
    is_bgm_separate: gr.Checkbox
    uvr_model_size: gr.Dropdown
    uvr_device: gr.Dropdown
    uvr_segment_size: gr.Number
    uvr_save_file: gr.Checkbox
    uvr_enable_offload: gr.Checkbox

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list. Use it in the Gradio UI before Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        A list of Gradio components
        """
        return [getattr(self, f.name) for f in fields(self)]

    @staticmethod
    def as_value(*args) -> 'WhisperValues':
        """
        Maps the raw values unpacked from Gradio components to a WhisperValues instance,
        for use inside functions after Gradio pre-processing.
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        Returns
        ----------
        WhisperValues
            Data class that holds the values of the parameters
        """
        return WhisperValues(*args)


@dataclass
class WhisperValues:
    """
    A data class that holds the actual Whisper parameter values.
    """
    model_size: str = "large-v2"
    lang: Optional[str] = None
    is_translate: bool = False
    beam_size: int = 5
    log_prob_threshold: float = -1.0
    no_speech_threshold: float = 0.6
    compute_type: str = "float16"
    best_of: int = 5
    patience: float = 1.0
    condition_on_previous_text: bool = True
    prompt_reset_on_temperature: float = 0.5
    initial_prompt: Optional[str] = None
    temperature: float = 0.0
    compression_ratio_threshold: float = 2.4
    vad_filter: bool = False
    threshold: float = 0.5
    min_speech_duration_ms: int = 250
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    speech_pad_ms: int = 400
    batch_size: int = 24
    is_diarize: bool = False
    hf_token: str = ""
    diarization_device: str = "cuda"
    length_penalty: float = 1.0
    repetition_penalty: float = 1.0
    no_repeat_ngram_size: int = 0
    prefix: Optional[str] = None
    suppress_blank: bool = True
    suppress_tokens: Optional[str] = "[-1]"
    max_initial_timestamp: float = 0.0
    word_timestamps: bool = False
    prepend_punctuations: Optional[str] = "\"'“¿([{-"
    append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
    max_new_tokens: Optional[int] = None
    chunk_length: Optional[int] = 30
    hallucination_silence_threshold: Optional[float] = None
    hotwords: Optional[str] = None
    language_detection_threshold: Optional[float] = None
    language_detection_segments: int = 1
    is_bgm_separate: bool = False
    uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
    uvr_device: str = "cuda"
    uvr_segment_size: int = 256
    uvr_save_file: bool = False
    uvr_enable_offload: bool = True

    def to_yaml(self) -> Dict:
        data = {
            "whisper": {
                "model_size": self.model_size,
                "lang": "Automatic Detection" if self.lang is None else self.lang,
                "is_translate": self.is_translate,
                "beam_size": self.beam_size,
                "log_prob_threshold": self.log_prob_threshold,
                "no_speech_threshold": self.no_speech_threshold,
                "best_of": self.best_of,
                "patience": self.patience,
                "condition_on_previous_text": self.condition_on_previous_text,
                "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
                "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
                "temperature": self.temperature,
                "compression_ratio_threshold": self.compression_ratio_threshold,
                "batch_size": self.batch_size,
                "length_penalty": self.length_penalty,
                "repetition_penalty": self.repetition_penalty,
                "no_repeat_ngram_size": self.no_repeat_ngram_size,
                "prefix": None if not self.prefix else self.prefix,
                "suppress_blank": self.suppress_blank,
                "suppress_tokens": self.suppress_tokens,
                "max_initial_timestamp": self.max_initial_timestamp,
                "word_timestamps": self.word_timestamps,
                "prepend_punctuations": self.prepend_punctuations,
                "append_punctuations": self.append_punctuations,
                "max_new_tokens": self.max_new_tokens,
                "chunk_length": self.chunk_length,
                "hallucination_silence_threshold": self.hallucination_silence_threshold,
                "hotwords": None if not self.hotwords else self.hotwords,
                "language_detection_threshold": self.language_detection_threshold,
                "language_detection_segments": self.language_detection_segments,
            },
            "vad": {
                "vad_filter": self.vad_filter,
                "threshold": self.threshold,
                "min_speech_duration_ms": self.min_speech_duration_ms,
                "max_speech_duration_s": self.max_speech_duration_s,
                "min_silence_duration_ms": self.min_silence_duration_ms,
                "speech_pad_ms": self.speech_pad_ms,
            },
            "diarization": {
                "is_diarize": self.is_diarize,
                "hf_token": self.hf_token
            },
            "bgm_separation": {
                "is_separate_bgm": self.is_bgm_separate,
                "model_size": self.uvr_model_size,
                "segment_size": self.uvr_segment_size,
                "save_file": self.uvr_save_file,
                "enable_offload": self.uvr_enable_offload
            },
        }
        return data

    def as_list(self) -> list:
        """
        Converts the data class attributes into a list.

        Returns
        ----------
        A list of Whisper parameter values
        """
        return [getattr(self, f.name) for f in fields(self)]
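A short sketch of the round trip between the two data classes above, mirroring how the transcription methods consume `*whisper_params`; the values here are just the dataclass defaults:

from modules.whisper.whisper_parameter import WhisperParameters, WhisperValues

# Values as they would arrive from Gradio, flattened in field order.
defaults = WhisperValues()
args = defaults.as_list()

# Rebuild the typed view from the positional arguments.
restored = WhisperParameters.as_value(*args)
assert restored == defaults
print(restored.to_yaml()["whisper"]["model_size"])  # large-v2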
notebook/whisper-webui.ipynb
ADDED
@@ -0,0 +1,132 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "---\n",
        "\n",
        "📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
        "\n",
        "🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
        "\n",
        "😎 **Support the Project**:\n",
        "\n",
        "If you find this project useful, please consider supporting it:\n",
        "\n",
        "<a href=\"https://ko-fi.com/jhj0517\" target=\"_blank\">\n",
        "  <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
        "</a>\n",
        "\n",
        "---"
      ],
      "metadata": {
        "id": "doKhBBXIfS21"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#@title #(Optional) Check GPU\n",
        "#@markdown Some models may not function correctly on a CPU runtime,\n",
        "\n",
        "#@markdown so you should check your GPU setup before running.\n",
        "!nvidia-smi"
      ],
      "metadata": {
        "id": "23yZvUlagEsx",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kNbSbsctxahq",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title #Installation\n",
        "#@markdown This cell will install dependencies for Whisper-WebUI!\n",
        "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
        "%cd Whisper-WebUI\n",
        "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
        "!pip install faster-whisper==1.0.3\n",
        "!pip install gradio==4.43.0\n",
        "# Temporary bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
        "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
        "!pip install tokenizers==0.19.1\n",
        "!pip install pyannote.audio==3.3.1\n",
        "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "#@title # (Optional) Configure arguments\n",
        "#@markdown This section is used to configure some command line arguments.\n",
        "\n",
        "#@markdown You can simply ignore this section and the default values will be used.\n",
        "\n",
        "USERNAME = '' #@param {type: \"string\"}\n",
        "PASSWORD = '' #@param {type: \"string\"}\n",
        "WHISPER_TYPE = 'faster-whisper' # @param [\"whisper\", \"faster-whisper\", \"insanely-fast-whisper\"]\n",
        "THEME = '' #@param {type: \"string\"}\n",
        "\n",
        "arguments = \"\"\n",
        "if USERNAME:\n",
        "  arguments += f\" --username {USERNAME}\"\n",
        "if PASSWORD:\n",
        "  arguments += f\" --password {PASSWORD}\"\n",
        "if THEME:\n",
        "  arguments += f\" --theme {THEME}\"\n",
        "if WHISPER_TYPE:\n",
        "  arguments += f\" --whisper_type {WHISPER_TYPE}\"\n",
        "\n",
        "\n",
        "#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
      ],
      "metadata": {
        "id": "Qosz9BFlGui3",
        "cellView": "form"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "PQroYRRZzQiN",
        "cellView": "form"
      },
      "outputs": [],
      "source": [
        "#@title #Run\n",
        "#@markdown Once the installation is complete, you can use the public URL that is displayed.\n",
        "if 'arguments' in locals():\n",
        "  !python app.py --share --colab{arguments}\n",
        "else:\n",
        "  !python app.py --share --colab"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
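For reference, with illustrative values `USERNAME = 'me'` and `PASSWORD = 'secret'` and the default `WHISPER_TYPE`, the Run cell above expands to:

!python app.py --share --colab --username me --password secret --whisper_type faster-whisper

The `--share` and `--colab` flags are only needed in notebook environments; a local launch would drop them.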
outputs/outputs are saved here.txt
ADDED
File without changes
outputs/translations/outputs for translation are saved here.txt
ADDED
File without changes