import os
import tempfile
from io import BytesIO

import streamlit as st
import torch
import librosa
import soundfile as sf
import wget
from TTS.api import TTS


class VoiceConverter:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()

    def load_models(self):
        # Download pre-trained models if they are not already cached
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)

        # Load the Coqui YourTTS model (multi-speaker, multilingual)
        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)

        # Download the VITS checkpoint if needed
        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path,
            )

        # NOTE: torch.load() only returns a usable model object if the file is
        # a fully pickled nn.Module. Public VITS checkpoints such as
        # G_953000.pth are bare state dicts that must be loaded into the
        # matching VITS architecture code; the FreeVC sketch at the bottom of
        # this file is a more portable alternative.
        checkpoint = torch.load(vits_path, map_location=self.device)
        if not isinstance(checkpoint, torch.nn.Module):
            raise RuntimeError(
                f"{vits_path} is a bare state dict, not a serialized model; "
                "load it into the matching VITS architecture, or use the "
                "FreeVC route sketched at the bottom of this file."
            )
        self.vits_model = checkpoint.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        # librosa resamples on load when sr is passed explicitly, so no
        # separate resampling step is needed
        wav, sr = librosa.load(audio_path, sr=22050)

        # Shape (1, T) float tensor on the target device
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

        # Voice conversion with VITS. A `voice_conversion()` method is assumed
        # on the loaded model (see the note in load_models()).
        with torch.no_grad():
            converted = self.vits_model.voice_conversion(wav_tensor, speaker_id=speaker_id)
        converted = converted.squeeze(0).cpu().numpy()

        # The original second pass called self.tts.tts_with_vc() with a wav
        # path as its first argument, but that method expects text. Coqui TTS
        # has no documented call that re-styles an existing waveform with an
        # emotion label, so `emotion` is accepted for the UI but currently
        # unused.
        return converted, sr


def save_audio(audio_data, sr):
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format="WAV")
    buffer.seek(0)  # rewind so callers read from the start
    return buffer
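# --- Sketch: wiring the "Advanced Settings" sliders into the audio path ---
# The pitch and speed sliders below were never applied in the original flow.
# This helper is an illustrative sketch built on librosa's standard effects;
# the function name is ours, not a library API. "Clarity" has no standard
# single-knob DSP equivalent and is left unwired.
def apply_prosody_controls(wav, sr, pitch_steps=0, speed=1.0):
    """Apply a pitch shift (in semitones) and a speed change to a mono waveform."""
    if pitch_steps != 0:
        wav = librosa.effects.pitch_shift(wav, sr=sr, n_steps=pitch_steps)
    if speed != 1.0:
        # rate > 1.0 speeds up, < 1.0 slows down; pitch is preserved
        wav = librosa.effects.time_stretch(wav, rate=speed)
    return wav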
# Streamlit interface
st.title("AI Voice Converter - Female Voice Transformation")

# Model selection (informational in this demo; the pipeline below always runs
# the VITS conversion)
model_type = st.selectbox(
    "Select Voice Model",
    ["VITS Female", "YourTTS Female", "Mixed Model"],
)

# Voice character selection
voice_character = st.selectbox(
    "Select Voice Character",
    ["Anime Female", "Natural Female", "Young Female", "Mature Female"],
)

# Emotion selection
emotion = st.selectbox(
    "Select Emotion",
    ["Happy", "Sad", "Angry", "Neutral", "Excited"],
)

# Additional parameters
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)

# File upload
uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

if uploaded_file is not None:
    # Initialize the converter
    converter = VoiceConverter()

    # Save the upload to a temporary file so librosa can read it from disk
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name

    if st.button("Convert Voice"):
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Map the selected voice character to a speaker ID
                speaker_id = {
                    "Anime Female": 0,
                    "Natural Female": 1,
                    "Young Female": 2,
                    "Mature Female": 3,
                }[voice_character]

                # Convert the voice
                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=speaker_id,
                    emotion=emotion,
                )

                # Apply the pitch / speed sliders (see the sketch above)
                converted_audio = apply_prosody_controls(
                    converted_audio, sr, pitch_adjust, speed
                )

                # Serialize once and reuse the bytes for playback and download
                audio_bytes = save_audio(converted_audio, sr).getvalue()

                # Display the audio player
                st.audio(audio_bytes, format="audio/wav")

                # Download button
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_bytes,
                    file_name="ai_converted_voice.wav",
                    mime="audio/wav",
                )
        except Exception as e:
            st.error(f"Error during conversion: {e}")

# Information about the models
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multilingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality

### Voice Characters:
- **Anime Female**: High-pitched, animated-style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice

### Tips for Best Results:
- Use clean input audio with minimal background noise
- Short clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Use the advanced settings for fine-tuning
""")

"""
Requirements (pip install):
TTS torch streamlit librosa soundfile wget
"""
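# --- Alternative sketch: Coqui's dedicated voice-conversion model ---
# A minimal sketch, assuming the Coqui TTS package's FreeVC model
# ("voice_conversion_models/multilingual/vctk/freevc24") is available. This
# sidesteps loading the raw VITS checkpoint above, which requires the
# matching architecture code. The function name is ours, not a library API.
def convert_with_freevc(source_wav, target_speaker_wav, output_path="converted.wav"):
    """Re-voice `source_wav` to match the speaker heard in `target_speaker_wav`."""
    vc = TTS("voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False)
    vc.voice_conversion_to_file(
        source_wav=source_wav,
        target_wav=target_speaker_wav,
        file_path=output_path,
    )
    return output_path

# Illustrative usage: convert_with_freevc("input.wav", "reference_female.wav")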