import os
import tempfile
from io import BytesIO

import librosa
import soundfile as sf
import streamlit as st
import torch
import wget
from TTS.api import TTS

class VoiceConverter:
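    """Two-stage converter: a pre-trained VITS model performs the speaker
    conversion, and Coqui TTS applies a best-effort emotion pass."""
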
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()
    def load_models(self):
        # Download pre-trained models if they don't already exist
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)
        # Load the Coqui TTS model
        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
        # Load the VITS model
        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            # Download the pre-trained VITS checkpoint
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path
            )
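        # NOTE: this assumes the checkpoint stores a fully serialized model
        # object exposing a voice_conversion() method. Checkpoints that hold
        # only a state_dict (as generator files like G_953000.pth usually do)
        # would instead require instantiating the VITS model class and calling
        # load_state_dict() on it.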
        self.vits_model = torch.load(vits_path, map_location=self.device)
        self.vits_model.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
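        """Convert the voice in `audio_path` to the target speaker and apply
        a best-effort emotion pass. Returns (waveform, sample_rate)."""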
        # Load audio at its native sample rate (librosa resamples to 22050 Hz
        # by default, which would make the check below dead code)
        wav, sr = librosa.load(audio_path, sr=None)
        # Resample if needed
        if sr != 22050:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
            sr = 22050
        # Convert to a batched tensor on the target device
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
        # Process with VITS
        with torch.no_grad():
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id
            )
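        # NOTE: Coqui's tts_with_vc() takes *text* as its first argument, and
        # only some Coqui models accept an `emotion` keyword, so the pass
        # below is best-effort: if the loaded model rejects the call, we fall
        # back to the raw VITS output.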
        # Process with Coqui TTS for emotion
        wav_path = "temp.wav"
        sf.write(wav_path, converted.squeeze(0).cpu().numpy(), sr)
        try:
            emotional_wav = self.tts.tts_with_vc(
                wav_path,
                speaker_wav=wav_path,
                emotion=emotion
            )
        except Exception:
            # Fall back to the plain VITS conversion
            emotional_wav = converted.squeeze(0).cpu().numpy()
        return emotional_wav, sr


def save_audio(audio_data, sr):
    # Write the waveform into an in-memory WAV buffer
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    buffer.seek(0)  # rewind so readers start from the beginning
    return buffer
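

# Assumed helper (not in the original script): st.cache_resource keeps a
# single VoiceConverter per process so model downloads and loading run once
# rather than on every Streamlit rerun.
@st.cache_resource
def get_converter():
    return VoiceConverter()
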
# Streamlit Interface
st.title("AI Voice Converter - Female Voice Transformation")
# Model selection
model_type = st.selectbox(
    "Select Voice Model",
    ["VITS Female", "YourTTS Female", "Mixed Model"]
)

# Voice character selection
voice_character = st.selectbox(
    "Select Voice Character",
    ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
)

# Emotion selection
emotion = st.selectbox(
    "Select Emotion",
    ["Happy", "Sad", "Angry", "Neutral", "Excited"]
)

# Additional parameters
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
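
# NOTE: model_type and the advanced settings above are collected by the UI
# but are not yet wired into convert_voice(), so they do not affect output.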
# File upload
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
if uploaded_file is not None:
    # Initialize the converter (cached across reruns)
    converter = get_converter()
    # Save the upload to a temp file, keeping its extension so librosa can
    # pick the right decoder for wav vs. mp3
    suffix = os.path.splitext(uploaded_file.name)[1] or '.wav'
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name
    if st.button("Convert Voice"):
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Map the selected voice character to a speaker ID
                speaker_id = {
                    "Anime Female": 0,
                    "Natural Female": 1,
                    "Young Female": 2,
                    "Mature Female": 3
                }[voice_character]
                # Convert voice
                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=speaker_id,
                    emotion=emotion
                )
                # Create the audio buffer and read it out once so the player
                # and the download button don't fight over the stream position
                audio_bytes = save_audio(converted_audio, sr).getvalue()
                # Display audio player
                st.audio(audio_bytes, format='audio/wav')
                # Download button
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_bytes,
                    file_name="ai_converted_voice.wav",
                    mime="audio/wav"
                )
        except Exception as e:
            st.error(f"Error during conversion: {str(e)}")
# Add information about the models
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality
### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice
### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")

# Requirements (install with pip):
#   TTS
#   torch
#   streamlit
#   librosa
#   soundfile
#   wget