File size: 5,363 Bytes
aaed37a
8d24163
 
2158d6f
 
 
 
 
 
 
aaed37a
 
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaed37a
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaed37a
 
 
2158d6f
 
 
aaed37a
 
 
 
 
8d24163
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d24163
2158d6f
 
 
 
8d24163
2158d6f
 
8d24163
 
 
2158d6f
 
8d24163
 
aaed37a
2158d6f
 
 
 
aaed37a
2158d6f
 
 
 
 
 
 
 
 
 
aaed37a
8d24163
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import torch
import torchaudio
import numpy as np
import librosa
import soundfile as sf
from TTS.api import TTS
from fairseq import checkpoint_utils
import wget
import os
from io import BytesIO
import tempfile
import huggingface_hub

class VoiceConverter:
    """Loads a VITS checkpoint plus a Coqui YourTTS model and converts an
    input recording into a female voice with an emotion applied."""

    def __init__(self):
        # Map model weights to GPU when available, otherwise CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()

    def load_models(self):
        """Download (on first run) and load the VITS checkpoint and the
        Coqui TTS voice-conversion model.

        Side effects: creates ``pretrained_models/`` and may download a
        checkpoint from Hugging Face on first use.
        """
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)

        # Coqui multilingual YourTTS model, used for the emotion pass.
        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)

        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            # Download VITS pre-trained checkpoint once and cache it locally.
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path
            )

        # NOTE(review): a generator checkpoint like G_953000.pth is typically a
        # state_dict, not a pickled nn.Module; if so, torch.load returns a dict
        # and the .eval() / .voice_conversion calls below will fail. Confirm the
        # checkpoint format and, if needed, instantiate the VITS architecture
        # and call load_state_dict instead.
        self.vits_model = torch.load(vits_path, map_location=self.device)
        self.vits_model.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        """Convert the audio at ``audio_path`` to the target speaker and emotion.

        Args:
            audio_path: path to a WAV/MP3 file readable by librosa.
            speaker_id: VITS speaker index selecting the target voice.
            emotion: emotion label forwarded to the Coqui TTS pass.

        Returns:
            Tuple of (converted waveform, sample rate in Hz).
        """
        # Load and resample in one step; librosa resamples to the requested
        # rate on load, so the previous separate resample branch was dead code.
        wav, sr = librosa.load(audio_path, sr=22050)

        # Shape (1, num_samples) batch tensor on the model's device.
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

        # VITS speaker conversion; inference only, so no autograd graph.
        with torch.no_grad():
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id
            )

        # Fix: the intermediate file used to be a hard-coded "temp.wav" in the
        # working directory — it collided between concurrent sessions and was
        # never deleted. Use a unique temp file and always clean it up.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        wav_path = tmp.name
        tmp.close()
        try:
            sf.write(wav_path, converted.cpu().numpy(), sr)

            # Second pass: apply the requested emotion via Coqui TTS.
            emotional_wav = self.tts.tts_with_vc(
                wav_path,
                speaker_wav=wav_path,
                emotion=emotion
            )
        finally:
            os.remove(wav_path)

        return emotional_wav, sr

def save_audio(audio_data, sr):
    """Encode ``audio_data`` at sample rate ``sr`` as WAV in memory.

    Returns:
        A BytesIO positioned at the start of the WAV data, ready for
        ``st.audio`` / ``st.download_button`` to read.
    """
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    # Fix: rewind the buffer. Without seek(0) the stream position is at EOF
    # after writing, so downstream readers receive zero bytes.
    buffer.seek(0)
    return buffer

# Streamlit Interface
st.title("AI Voice Converter - Female Voice Transformation")

# Model selection
model_type = st.selectbox(
    "Select Voice Model",
    ["VITS Female", "YourTTS Female", "Mixed Model"]
)

# Voice character selection
voice_character = st.selectbox(
    "Select Voice Character",
    ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
)

# Emotion selection
emotion = st.selectbox(
    "Select Emotion",
    ["Happy", "Sad", "Angry", "Neutral", "Excited"]
)

# Additional parameters
# NOTE(review): these sliders (and model_type above) are collected but never
# passed into the conversion pipeline — wire them up or remove them.
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)


@st.cache_resource
def _load_converter():
    """Build the converter once per server process — model loading (and the
    first-run checkpoint download) is far too slow to repeat on every rerun."""
    return VoiceConverter()


# File upload
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    # Fix: previously a fresh VoiceConverter was constructed on every Streamlit
    # rerun, reloading every model each time a widget changed.
    converter = _load_converter()

    # Persist the upload to disk so librosa can read it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name

    if st.button("Convert Voice"):
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Get speaker ID based on voice character
                speaker_id = {
                    "Anime Female": 0,
                    "Natural Female": 1,
                    "Young Female": 2,
                    "Mature Female": 3
                }[voice_character]

                # Convert voice
                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=speaker_id,
                    emotion=emotion
                )

                # Create audio buffer
                audio_buffer = save_audio(converted_audio, sr)

                # Display audio player
                st.audio(audio_buffer, format='audio/wav')

                # Download button
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_buffer,
                    file_name="ai_converted_voice.wav",
                    mime="audio/wav"
                )

        except Exception as e:
            st.error(f"Error during conversion: {str(e)}")
        finally:
            # Fix: the uploaded temp copy used to leak on every conversion.
            os.remove(tmp_path)

# Add information about the models
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality

### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice

### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")

# Requirements
# (Previously a bare triple-quoted string — a no-op expression at runtime;
# kept as comments so intent is explicit.)
# pip install:
#   TTS
#   fairseq
#   torch
#   torchaudio
#   streamlit
#   librosa
#   soundfile
#   numpy
#   wget
#   huggingface_hub