File size: 5,363 Bytes
aaed37a
8d24163
 
2158d6f
 
 
 
 
 
 
aaed37a
 
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaed37a
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaed37a
 
 
2158d6f
 
 
aaed37a
 
 
 
 
8d24163
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d24163
2158d6f
 
 
 
8d24163
2158d6f
 
8d24163
 
 
2158d6f
 
8d24163
 
aaed37a
2158d6f
 
 
 
aaed37a
2158d6f
 
 
 
 
 
 
 
 
 
aaed37a
8d24163
2158d6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import torch
import torchaudio
import numpy as np
import librosa
import soundfile as sf
from TTS.api import TTS
from fairseq import checkpoint_utils
import wget
import os
from io import BytesIO
import tempfile
import huggingface_hub

class VoiceConverter:
    """Loads a VITS checkpoint plus a Coqui YourTTS model and converts an
    input recording into a female voice with an emotion applied."""

    def __init__(self):
        # Map model weights to GPU when available, otherwise CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()

    def load_models(self):
        """Download (on first run) and load the VITS checkpoint and the
        Coqui TTS voice-conversion model.

        Side effects: creates ``pretrained_models/`` and may download a
        checkpoint from Hugging Face on first use.
        """
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)

        # Coqui multilingual YourTTS model, used for the emotion pass.
        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)

        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            # Download VITS pre-trained checkpoint once and cache it locally.
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path
            )

        # NOTE(review): a generator checkpoint like G_953000.pth is typically a
        # state_dict, not a pickled nn.Module; if so, torch.load returns a dict
        # and the .eval() / .voice_conversion calls below will fail. Confirm the
        # checkpoint format and, if needed, instantiate the VITS architecture
        # and call load_state_dict instead.
        self.vits_model = torch.load(vits_path, map_location=self.device)
        self.vits_model.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        """Convert the audio at ``audio_path`` to the target speaker and emotion.

        Args:
            audio_path: path to a WAV/MP3 file readable by librosa.
            speaker_id: VITS speaker index selecting the target voice.
            emotion: emotion label forwarded to the Coqui TTS pass.

        Returns:
            Tuple of (converted waveform, sample rate in Hz).
        """
        # Load and resample in one step; librosa resamples to the requested
        # rate on load, so the previous separate resample branch was dead code.
        wav, sr = librosa.load(audio_path, sr=22050)

        # Shape (1, num_samples) batch tensor on the model's device.
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

        # VITS speaker conversion; inference only, so no autograd graph.
        with torch.no_grad():
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id
            )

        # Fix: the intermediate file used to be a hard-coded "temp.wav" in the
        # working directory — it collided between concurrent sessions and was
        # never deleted. Use a unique temp file and always clean it up.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        wav_path = tmp.name
        tmp.close()
        try:
            sf.write(wav_path, converted.cpu().numpy(), sr)

            # Second pass: apply the requested emotion via Coqui TTS.
            emotional_wav = self.tts.tts_with_vc(
                wav_path,
                speaker_wav=wav_path,
                emotion=emotion
            )
        finally:
            os.remove(wav_path)

        return emotional_wav, sr

def save_audio(audio_data, sr):
    """Encode ``audio_data`` at sample rate ``sr`` as WAV in memory.

    Returns:
        A BytesIO positioned at the start of the WAV data, ready for
        ``st.audio`` / ``st.download_button`` to read.
    """
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    # Fix: rewind the buffer. Without seek(0) the stream position is at EOF
    # after writing, so downstream readers receive zero bytes.
    buffer.seek(0)
    return buffer

# Streamlit Interface
st.title("AI Voice Converter - Female Voice Transformation")

# Model selection
model_type = st.selectbox(
    "Select Voice Model",
    ["VITS Female", "YourTTS Female", "Mixed Model"]
)

# Voice character selection
voice_character = st.selectbox(
    "Select Voice Character",
    ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
)

# Emotion selection
emotion = st.selectbox(
    "Select Emotion",
    ["Happy", "Sad", "Angry", "Neutral", "Excited"]
)

# Additional parameters
# NOTE(review): these sliders (and model_type above) are collected but never
# passed into the conversion pipeline — wire them up or remove them.
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)


@st.cache_resource
def _load_converter():
    """Build the converter once per server process — model loading (and the
    first-run checkpoint download) is far too slow to repeat on every rerun."""
    return VoiceConverter()


# File upload
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    # Fix: previously a fresh VoiceConverter was constructed on every Streamlit
    # rerun, reloading every model each time a widget changed.
    converter = _load_converter()

    # Persist the upload to disk so librosa can read it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name

    if st.button("Convert Voice"):
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                # Get speaker ID based on voice character
                speaker_id = {
                    "Anime Female": 0,
                    "Natural Female": 1,
                    "Young Female": 2,
                    "Mature Female": 3
                }[voice_character]

                # Convert voice
                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=speaker_id,
                    emotion=emotion
                )

                # Create audio buffer
                audio_buffer = save_audio(converted_audio, sr)

                # Display audio player
                st.audio(audio_buffer, format='audio/wav')

                # Download button
                st.download_button(
                    label="Download Converted Audio",
                    data=audio_buffer,
                    file_name="ai_converted_voice.wav",
                    mime="audio/wav"
                )

        except Exception as e:
            st.error(f"Error during conversion: {str(e)}")
        finally:
            # Fix: the uploaded temp copy used to leak on every conversion.
            os.remove(tmp_path)

# Add information about the models
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality

### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice

### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")

# Requirements
# (Previously a bare triple-quoted string — a no-op expression at runtime;
# kept as comments so intent is explicit.)
# pip install:
#   TTS
#   fairseq
#   torch
#   torchaudio
#   streamlit
#   librosa
#   soundfile
#   numpy
#   wget
#   huggingface_hub