Upload 12 files
- .gitignore +11 -0
- LICENSE +20 -0
- MANIFEST.in +2 -0
- README.md +95 -13
- app.py +29 -0
- batch.lst +3 -0
- cog.yaml +43 -0
- inference.py +235 -0
- predict.py +53 -0
- requirements.txt +16 -0
- setup.py +162 -0
- visualization.png +0 -0
.gitignore
ADDED
@@ -0,0 +1,11 @@
+*.pyc
+__pycache__
+test.py
+flagged
+output
+gradio_cached*
+*egg-info
+build*
+*.wav
+dist/
+*DS_Store*
LICENSE
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2012-2023 Scott Chacon and others
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
MANIFEST.in
ADDED
@@ -0,0 +1,2 @@
+include *.py LICENSE README.md
+recursive-include audiosr *.txt *.py *.gz *.npy *.json
README.md
CHANGED
@@ -1,13 +1,95 @@
+
+# AudioSR: Versatile Audio Super-resolution at Scale
+
+[![arXiv](https://img.shields.io/badge/arXiv-2309.07314-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2309.07314) [![githubio](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://audioldm.github.io/audiosr) [![Replicate](https://replicate.com/nateraw/audio-super-resolution/badge)](https://replicate.com/nateraw/audio-super-resolution)
+
+Pass your audio in, and AudioSR will make it high fidelity!
+
+Works on all types of audio (e.g., music, speech, dog barking, rain, ...) and all sampling rates.
+
+Share your thoughts/samples/issues in our discord channel: https://discord.gg/HWeBsJryaf
+
+![Image Description](https://github.com/haoheliu/versatile_audio_super_resolution/blob/main/visualization.png?raw=true)
+
+## Change Log
+- 2024-12-16: Add [Important things to know to make AudioSR work](example/how_to_make_audiosr_work.md).
+- 2023-09-24: Add Replicate demo (@nateraw); fix errors on Windows and librosa warnings (@ORI-Muchim).
+- 2023-09-16: Fix DC shift issue. Fix duration padding bug. Update default DDIM steps to 50.
+
+## Gradio Demo
+
+To run the Gradio demo locally:
+
+1. Install dependencies: `pip install -r requirements.txt`
+2. Run the app: `python app.py`
+3. Open the URL displayed to view the demo.
+
+## Commandline Usage
+
+### Installation
+```shell
+# Optional
+conda create -n audiosr python=3.9; conda activate audiosr
+# Install AudioSR
+pip3 install audiosr==0.0.7
+# or
+# pip3 install git+https://github.com/haoheliu/versatile_audio_super_resolution.git
+```
+
+### Usage
+
+Process a list of files. The results will be saved to ./output by default.
+
+```shell
+audiosr -il batch.lst
+```
+
+Process a single audio file.
+```shell
+audiosr -i example/music.wav
+```
+
+Full usage instructions:
+
+```shell
+> audiosr -h
+
+> usage: audiosr [-h] -i INPUT_AUDIO_FILE [-il INPUT_FILE_LIST] [-s SAVE_PATH] [--model_name {basic,speech}] [-d DEVICE] [--ddim_steps DDIM_STEPS] [-gs GUIDANCE_SCALE] [--seed SEED]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -i INPUT_AUDIO_FILE, --input_audio_file INPUT_AUDIO_FILE
+                        Input audio file for audio super resolution
+  -il INPUT_FILE_LIST, --input_file_list INPUT_FILE_LIST
+                        A file that contains all audio files that need to perform audio super resolution
+  -s SAVE_PATH, --save_path SAVE_PATH
+                        The path to save model output
+  --model_name {basic,speech}
+                        The checkpoint you gonna use
+  -d DEVICE, --device DEVICE
+                        The device for computation. If not specified, the script will automatically choose the device based on your environment.
+  --ddim_steps DDIM_STEPS
+                        The sampling step for DDIM
+  -gs GUIDANCE_SCALE, --guidance_scale GUIDANCE_SCALE
+                        Guidance scale (Large => better quality and relavancy to text; Small => better diversity)
+  --seed SEED           Change this value (any integer number) will lead to a different generation result.
+  --suffix SUFFIX       Suffix for the output file
+```
+
+
+## TODO
+[!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/haoheliuP)
+
+- [ ] Add gradio demo.
+- [ ] Optimize the inference speed.
+
+## Cite our work
+If you find this repo useful, please consider citing:
+```bibtex
+@article{liu2023audiosr,
+  title={{AudioSR}: Versatile Audio Super-resolution at Scale},
+  author={Liu, Haohe and Chen, Ke and Tian, Qiao and Wang, Wenwu and Plumbley, Mark D},
+  journal={arXiv preprint arXiv:2309.07314},
+  year={2023}
+}
+```
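Note: the command-line examples in the README above have a Python-API counterpart that the files below rely on. The following is a minimal sketch of that usage, assuming `build_model` and `super_resolution` accept the arguments they are given in app.py and predict.py in this upload; the 48 kHz output rate and the int16 conversion are taken from predict.py, and this is not an official example.

```python
# Sketch of the Python API as used by app.py / predict.py in this upload;
# argument names and the 48 kHz output rate are assumptions taken from
# those files.
import numpy as np
import soundfile as sf
from audiosr import build_model, super_resolution

model = build_model(model_name="basic", device="auto")  # or model_name="speech"
waveform = super_resolution(
    model,
    "example/music.wav",
    guidance_scale=3.5,
    ddim_steps=50,
)
# predict.py below writes the result as 16-bit PCM at 48 kHz.
out = (waveform[0] * 32767).astype(np.int16).T
sf.write("out.wav", out, 48000)
```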
app.py
ADDED
@@ -0,0 +1,29 @@
+import gradio as gr
+from audiosr import super_resolution, build_model
+
+def inference(audio_file, model_name, guidance_scale, ddim_steps):
+    audiosr = build_model(model_name=model_name)
+
+    waveform = super_resolution(
+        audiosr,
+        audio_file,
+        guidance_scale=guidance_scale,
+        ddim_steps=ddim_steps
+    )
+
+    return (44100, waveform)
+
+iface = gr.Interface(
+    fn=inference,
+    inputs=[
+        gr.Audio(type="filepath", label="Input Audio"),
+        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
+        gr.Slider(1, 10, value=3.5, step=0.1, label="Guidance Scale"),
+        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps")
+    ],
+    outputs=gr.Audio(type="numpy", label="Output Audio"),
+    title="AudioSR",
+    description="Audio Super Resolution with AudioSR"
+)
+
+iface.launch()
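Note: app.py above calls `build_model` inside the request handler, so the model is reloaded on every submission, and it reports a 44.1 kHz sample rate while the rest of this upload treats the model output as 48 kHz. A hedged variant of the handler that caches one model per name and returns 48 kHz; this is a hypothetical adjustment, not part of the upload.

```python
# Hypothetical variant of app.py's handler: cache one loaded model per
# model name and return the 48 kHz rate used elsewhere in this upload.
from audiosr import build_model, super_resolution

_models = {}  # model_name -> loaded model

def inference(audio_file, model_name, guidance_scale, ddim_steps):
    if model_name not in _models:
        _models[model_name] = build_model(model_name=model_name)
    waveform = super_resolution(
        _models[model_name],
        audio_file,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps,
    )
    return (48000, waveform)
```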
batch.lst
ADDED
@@ -0,0 +1,3 @@
+example/music.wav
+example/speech.wav
+example/sound_effect.wav
cog.yaml
ADDED
@@ -0,0 +1,43 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  cuda: "11.7"
+
+  # a list of ubuntu apt packages to install
+  system_packages:
+    - "ffmpeg"
+    - "libsndfile1"
+
+  # python version in the form '3.8' or '3.8.12'
+  python_version: "3.9"
+
+  # a list of packages in the format <package-name>==<version>
+  python_packages:
+    - "torch==2.0.1"
+    - "torchaudio==2.0.2"
+    - "torchvision==0.15.2"
+    - "tqdm==4.66.1"
+    - "gradio==3.44.4"
+    - "pyyaml==6.0.1"
+    - "einops==0.6.1"
+    - "chardet==5.2.0"
+    - "numpy==1.23.5"
+    - "soundfile==0.12.1"
+    - "librosa==0.9.2"
+    - "scipy==1.11.2"
+    - "pandas==2.1.0"
+    - "unidecode==1.3.6"
+    - "phonemizer==3.2.1"
+    - "torchlibrosa==0.1.0"
+    - "transformers==4.30.2"
+    - "huggingface_hub==0.17.2"
+    - "progressbar==2.5"
+    - "ftfy==6.1.1"
+    - "timm==0.9.7"
+    - "audiosr==0.0.7"
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
inference.py
ADDED
@@ -0,0 +1,235 @@
+import gc
+import os
+import random
+import numpy as np
+from scipy.signal.windows import hann
+import soundfile as sf
+import torch
+from cog import BasePredictor, Input, Path
+import tempfile
+import argparse
+import librosa
+from audiosr import build_model, super_resolution
+from scipy import signal
+import pyloudnorm as pyln
+
+
+import warnings
+warnings.filterwarnings("ignore")
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+torch.set_float32_matmul_precision("high")
+
+def match_array_shapes(array_1: np.ndarray, array_2: np.ndarray):
+    if (len(array_1.shape) == 1) & (len(array_2.shape) == 1):
+        if array_1.shape[0] > array_2.shape[0]:
+            array_1 = array_1[:array_2.shape[0]]
+        elif array_1.shape[0] < array_2.shape[0]:
+            array_1 = np.pad(array_1, ((array_2.shape[0] - array_1.shape[0], 0)), 'constant', constant_values=0)
+    else:
+        if array_1.shape[1] > array_2.shape[1]:
+            array_1 = array_1[:, :array_2.shape[1]]
+        elif array_1.shape[1] < array_2.shape[1]:
+            padding = array_2.shape[1] - array_1.shape[1]
+            array_1 = np.pad(array_1, ((0, 0), (0, padding)), 'constant', constant_values=0)
+    return array_1
+
+
+def lr_filter(audio, cutoff, filter_type, order=12, sr=48000):
+    audio = audio.T
+    nyquist = 0.5 * sr
+    normal_cutoff = cutoff / nyquist
+    b, a = signal.butter(order//2, normal_cutoff, btype=filter_type, analog=False)
+    sos = signal.tf2sos(b, a)
+    filtered_audio = signal.sosfiltfilt(sos, audio)
+    return filtered_audio.T
+
+class Predictor(BasePredictor):
+    def setup(self, model_name="basic", device="auto"):
+        self.model_name = model_name
+        self.device = device
+        self.sr = 48000
+        print("Loading Model...")
+        self.audiosr = build_model(model_name=self.model_name, device=self.device)
+        # print(self.audiosr)
+        # exit()
+        print("Model loaded!")
+
+    def process_audio(self, input_file, chunk_size=5.12, overlap=0.1, seed=None, guidance_scale=3.5, ddim_steps=50):
+        audio, sr = librosa.load(input_file, sr=input_cutoff * 2, mono=False)
+        audio = audio.T
+        sr = input_cutoff * 2
+        print(f"audio.shape = {audio.shape}")
+        print(f"input cutoff = {input_cutoff}")
+
+        is_stereo = len(audio.shape) == 2
+        audio_channels = [audio] if not is_stereo else [audio[:, 0], audio[:, 1]]
+        print("Audio is stereo" if is_stereo else "Audio is mono")
+
+        chunk_samples = int(chunk_size * sr)
+        overlap_samples = int(overlap * chunk_samples)
+        output_chunk_samples = int(chunk_size * self.sr)
+        output_overlap_samples = int(overlap * output_chunk_samples)
+        enable_overlap = overlap > 0
+        print(f"enable_overlap = {enable_overlap}")
+
+        def process_chunks(audio):
+            chunks = []
+            original_lengths = []
+            start = 0
+            while start < len(audio):
+                end = min(start + chunk_samples, len(audio))
+                chunk = audio[start:end]
+                if len(chunk) < chunk_samples:
+                    original_lengths.append(len(chunk))
+                    chunk = np.concatenate([chunk, np.zeros(chunk_samples - len(chunk))])
+                else:
+                    original_lengths.append(chunk_samples)
+                chunks.append(chunk)
+                start += chunk_samples - overlap_samples if enable_overlap else chunk_samples
+            return chunks, original_lengths
+
+        # Process both channels (mono or stereo)
+        chunks_per_channel = [process_chunks(channel) for channel in audio_channels]
+        sample_rate_ratio = self.sr / sr
+        total_length = len(chunks_per_channel[0][0]) * output_chunk_samples - (len(chunks_per_channel[0][0]) - 1) * (output_overlap_samples if enable_overlap else 0)
+        reconstructed_channels = [np.zeros((1, total_length)) for _ in audio_channels]
+
+        meter_before = pyln.Meter(sr)
+        meter_after = pyln.Meter(self.sr)
+
+        # Process chunks for each channel
+        for ch_idx, (chunks, original_lengths) in enumerate(chunks_per_channel):
+            for i, chunk in enumerate(chunks):
+                loudness_before = meter_before.integrated_loudness(chunk)
+                print(f"Processing chunk {i+1} of {len(chunks)} for {'Left/Mono' if ch_idx == 0 else 'Right'} channel")
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_wav:
+                    sf.write(temp_wav.name, chunk, sr)
+
+                    out_chunk = super_resolution(
+                        self.audiosr,
+                        temp_wav.name,
+                        seed=seed,
+                        guidance_scale=guidance_scale,
+                        ddim_steps=ddim_steps,
+                        latent_t_per_second=12.8
+                    )
+
+                    out_chunk = out_chunk[0]
+                    num_samples_to_keep = int(original_lengths[i] * sample_rate_ratio)
+                    out_chunk = out_chunk[:, :num_samples_to_keep].squeeze()
+                    loudness_after = meter_after.integrated_loudness(out_chunk)
+                    out_chunk = pyln.normalize.loudness(out_chunk, loudness_after, loudness_before)
+
+                    if enable_overlap:
+                        actual_overlap_samples = min(output_overlap_samples, num_samples_to_keep)
+                        fade_out = np.linspace(1., 0., actual_overlap_samples)
+                        fade_in = np.linspace(0., 1., actual_overlap_samples)
+
+                        if i == 0:
+                            out_chunk[-actual_overlap_samples:] *= fade_out
+                        elif i < len(chunks) - 1:
+                            out_chunk[:actual_overlap_samples] *= fade_in
+                            out_chunk[-actual_overlap_samples:] *= fade_out
+                        else:
+                            out_chunk[:actual_overlap_samples] *= fade_in
+
+                    start = i * (output_chunk_samples - output_overlap_samples if enable_overlap else output_chunk_samples)
+                    end = start + out_chunk.shape[0]
+                    reconstructed_channels[ch_idx][0, start:end] += out_chunk.flatten()
+
+        reconstructed_audio = np.stack(reconstructed_channels, axis=-1) if is_stereo else reconstructed_channels[0]
+
+        if multiband_ensemble:
+            low, _ = librosa.load(input_file, sr=48000, mono=False)
+            output = match_array_shapes(reconstructed_audio[0].T, low)
+            low = lr_filter(low.T, crossover_freq, 'lowpass', order=10)
+            high = lr_filter(output.T, crossover_freq, 'highpass', order=10)
+            high = lr_filter(high, 23000, 'lowpass', order=2)
+            output = low + high
+        else:
+            output = reconstructed_audio[0]
+        # print(output, type(output))
+        return output
+
+
+    def predict(self,
+        input_file: Path = Input(description="Audio to upsample"),
+        ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
+        guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
+        overlap: float = Input(description="overlap size", default=0.04),
+        chunk_size: float = Input(description="chunksize", default=10.24),
+        seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None)
+    ) -> Path:
+
+        if seed == 0:
+            seed = random.randint(0, 2**32 - 1)
+            print(f"Setting seed to: {seed}")
+        print(f"overlap = {overlap}")
+        print(f"guidance_scale = {guidance_scale}")
+        print(f"ddim_steps = {ddim_steps}")
+        print(f"chunk_size = {chunk_size}")
+        print(f"multiband_ensemble = {multiband_ensemble}")
+        print(f"input file = {os.path.basename(input_file)}")
+        os.makedirs(output_folder, exist_ok=True)
+        waveform = self.process_audio(
+            input_file,
+            chunk_size=chunk_size,
+            overlap=overlap,
+            seed=seed,
+            guidance_scale=guidance_scale,
+            ddim_steps=ddim_steps
+        )
+
+        filename = os.path.splitext(os.path.basename(input_file))[0]
+        sf.write(f"{output_folder}/SR_{filename}.wav", data=waveform, samplerate=48000, subtype="PCM_16")
+        print(f"file created: {output_folder}/SR_{filename}.wav")
+        del self.audiosr, waveform
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="Run AudioSR super-resolution on an input audio file.")
+    parser.add_argument("--input", help="Path to input audio file")
+    parser.add_argument("--output", help="Output folder")
+    parser.add_argument("--ddim_steps", help="Number of ddim steps", type=int, required=False, default=50)
+    parser.add_argument("--chunk_size", help="chunk size", type=float, required=False, default=10.24)
+    parser.add_argument("--guidance_scale", help="Guidance scale value", type=float, required=False, default=3.5)
+    parser.add_argument("--seed", help="Seed value, 0 = random seed", type=int, required=False, default=0)
+    parser.add_argument("--overlap", help="overlap value", type=float, required=False, default=0.04)
+    parser.add_argument("--multiband_ensemble", type=bool, help="Use multiband ensemble with input")
+    parser.add_argument("--input_cutoff", help="Define the crossover of audio input in the multiband ensemble", type=int, required=False, default=12000)
+
+    args = parser.parse_args()
+
+    input_file_path = args.input
+    output_folder = args.output
+    ddim_steps = args.ddim_steps
+    chunk_size = args.chunk_size
+    guidance_scale = args.guidance_scale
+    seed = args.seed
+    overlap = args.overlap
+    input_cutoff = args.input_cutoff
+    multiband_ensemble = args.multiband_ensemble
+
+    crossover_freq = input_cutoff - 1000
+
+    p = Predictor()
+
+    p.setup(device='auto')
+
+
+    out = p.predict(
+        input_file_path,
+        ddim_steps=ddim_steps,
+        guidance_scale=guidance_scale,
+        seed=seed,
+        chunk_size=chunk_size,
+        overlap=overlap
+    )
+
+    del p
+    gc.collect()
+    torch.cuda.empty_cache()
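Note: in inference.py above, `process_audio` and `predict` read `input_cutoff`, `crossover_freq`, `multiband_ensemble`, and `output_folder` as module-level globals that are only assigned inside the `if __name__ == "__main__":` block, so the Predictor only works when the file is run as a script. The following is a hypothetical sketch of how a caller importing the module would have to set those names first, mirroring what the `__main__` block does; it is not part of the upload.

```python
# Hypothetical usage of inference.py as a module: the globals it reads
# must be assigned before predict() runs, exactly as the __main__ block does.
import inference

inference.input_cutoff = 12000
inference.crossover_freq = inference.input_cutoff - 1000
inference.multiband_ensemble = False
inference.output_folder = "output"

p = inference.Predictor()
p.setup(device="auto")
p.predict(
    "example/music.wav",
    ddim_steps=50,
    guidance_scale=3.5,
    seed=0,        # 0 triggers the random-seed branch in predict()
    chunk_size=10.24,
    overlap=0.04,
)
```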
predict.py
ADDED
@@ -0,0 +1,53 @@
+import os
+import random
+
+import numpy as np
+import soundfile as sf
+import torch
+from cog import BasePredictor, Input, Path
+
+from audiosr import build_model, super_resolution
+
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+torch.set_float32_matmul_precision("high")
+
+class Predictor(BasePredictor):
+    def setup(self, model_name="basic", device="auto"):
+        self.model_name = model_name
+        self.device = device
+        self.sr = 48000
+        self.audiosr = build_model(model_name=self.model_name, device=self.device)
+
+    def predict(self,
+        input_file: Path = Input(description="Audio to upsample"),
+        ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
+        guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
+        seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None)
+    ) -> Path:
+        """Run a single prediction on the model"""
+        if seed is None:
+            seed = random.randint(0, 2**32 - 1)
+            print(f"Setting seed to: {seed}")
+
+        waveform = super_resolution(
+            self.audiosr,
+            input_file,
+            seed=seed,
+            guidance_scale=guidance_scale,
+            ddim_steps=ddim_steps,
+            latent_t_per_second=12.8
+        )
+        out_wav = (waveform[0] * 32767).astype(np.int16).T
+        sf.write("out.wav", data=out_wav, samplerate=48000)
+        return Path("out.wav")
+
+
+if __name__ == "__main__":
+    p = Predictor()
+    p.setup()
+    out = p.predict(
+        "example/music.wav",
+        ddim_steps=50,
+        guidance_scale=3.5,
+        seed=42
+    )
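Note: predict.py above converts the float waveform with `(waveform[0] * 32767).astype(np.int16)`; if any sample falls outside [-1, 1], that cast wraps around rather than saturating. A small hedged helper that clips first; this reflects an assumption about the desired behavior and is not part of the upload.

```python
# Hypothetical helper: clip to [-1, 1] before the int16 cast used in
# predict.py, so out-of-range samples saturate instead of wrapping.
import numpy as np

def float_to_int16(waveform: np.ndarray) -> np.ndarray:
    clipped = np.clip(waveform, -1.0, 1.0)
    return (clipped * 32767).astype(np.int16)
```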
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+git+https://github.com/huggingface/diffusers.git
+torch==2.0.1+cu118; sys_platform != 'darwin'
+torch==2.0.1; sys_platform == 'darwin'
+torchvision==0.15.2+cu118; sys_platform != 'darwin'
+torchvision==0.15.2; sys_platform == 'darwin'
+torchaudio==2.0.2+cu118; sys_platform != 'darwin'
+torchaudio==2.0.2; sys_platform == 'darwin'
+huggingface_hub
+transformers==4.30.2
+gradio
+soundfile
+progressbar
+librosa
+audiosr
+unidecode
setup.py
ADDED
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# python3 setup.py sdist bdist_wheel
+"""
+@File    :   setup.py.py
+@Contact :   [email protected]
+@License :   (C)Copyright 2020-2100
+
+@Modify Time      @Author    @Version    @Description
+------------      -------    --------    -----------
+9/6/21 5:16 PM   Haohe Liu      1.0         None
+"""
+
+# !/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Note: To use the 'upload' functionality of this file, you must:
+#   $ pipenv install twine --dev
+
+import io
+import os
+import sys
+from shutil import rmtree
+
+from setuptools import find_packages, setup, Command
+
+# Package meta-data.
+NAME = "audiosr"
+DESCRIPTION = "This package is written for text-to-audio/music generation."
+URL = "https://github.com/haoheliu/audiosr"
+EMAIL = "[email protected]"
+AUTHOR = "Haohe Liu"
+REQUIRES_PYTHON = ">=3.7.0"
+VERSION = "0.0.7"
+
+# What packages are required for this module to be executed?
+REQUIRED = [
+    "torch>=1.13.0",
+    "torchaudio>=0.13.0",
+    "torchvision>=0.14.0",
+    "tqdm",
+    "gradio",
+    "pyyaml",
+    "einops",
+    "chardet",
+    "numpy<=1.23.5",
+    "soundfile",
+    "librosa==0.9.2",
+    "scipy",
+    "pandas",
+    "unidecode",
+    "phonemizer",
+    "torchlibrosa>=0.0.9",
+    "transformers==4.30.2",
+    "huggingface_hub",
+    "progressbar",
+    "ftfy",
+    "timm",
+]
+
+# What packages are optional?
+EXTRAS = {}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+try:
+    with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f:
+        long_description = "\n" + f.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+
+# Load the package's __version__.py module as a dictionary.
+about = {}
+if not VERSION:
+    project_slug = NAME.lower().replace("-", "_").replace(" ", "_")
+    with open(os.path.join(here, project_slug, "__version__.py")) as f:
+        exec(f.read(), about)
+else:
+    about["__version__"] = VERSION
+
+
+class UploadCommand(Command):
+    """Support setup.py upload."""
+
+    description = "Build and publish the package."
+    user_options = []
+
+    @staticmethod
+    def status(s):
+        """Prints things in bold."""
+        print("\033[1m{0}\033[0m".format(s))
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        try:
+            self.status("Removing previous builds…")
+            rmtree(os.path.join(here, "dist"))
+        except OSError:
+            pass
+
+        self.status("Building Source and Wheel (universal) distribution…")
+        os.system("{0} setup.py sdist bdist_wheel --universal".format(sys.executable))
+
+        self.status("Uploading the package to PyPI via Twine…")
+        os.system("twine upload dist/*")
+
+        self.status("Pushing git tags…")
+        os.system("git tag v{0}".format(about["__version__"]))
+        os.system("git push --tags")
+
+        sys.exit()
+
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=about["__version__"],
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    # packages=find_packages(exclude=[]),
+    # If your package is a single module, use this instead of 'packages':
+    # entry_points={
+    #     'console_scripts': ['mycli=mymodule:cli'],
+    # },
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    packages=find_packages(),
+    include_package_data=True,
+    license="MIT",
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: Implementation :: CPython",
+        "Programming Language :: Python :: Implementation :: PyPy",
+    ],
+    # $ setup.py publish support.
+    cmdclass={
+        "upload": UploadCommand,
+    },
+    scripts=["bin/audiosr.cmd", "bin/audiosr"],
+)
visualization.png
ADDED
(binary image file; no text preview)