Spaces:

Artificial-superintelligence
/

Algorithmvoice

Running

App Files Files Community

Artificial-superintelligence committed on Oct 17, 2024

Commit

20d8ce9

verified ·

1 Parent(s): 375b8bc

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -113

app.py CHANGED Viewed

@@ -3,129 +3,123 @@ import librosa
 import soundfile as sf
 import numpy as np
 import scipy.signal as signal
-from scipy.io import wavfile
 from io import BytesIO
 import tempfile
-def modify_formants(y, sr, formant_shift_factor=1.2):
-    # Get the power spectrum
-    D = librosa.stft(y)
-    S = np.abs(D)
-    # Use frame-based processing for LPC
-    frame_length = 2048
-    hop_length = 512
-    frames = librosa.util.frame(y, frame_length=frame_length, hop_length=hop_length)
-    # Process each frame
-    modified_frames = []
-    for frame in frames.T:
-        # Calculate LPC coefficients
-        a = librosa.lpc(frame, order=12)
-        # Shift formants
-        new_a = np.zeros_like(a)
-        new_a[0] = a[0]
-        for i in range(1, len(a)):
-            new_a[i] = a[i] * (formant_shift_factor ** i)
-        # Apply modified LPC filter
-        modified_frame = signal.lfilter([1], new_a, frame)
-        modified_frames.append(modified_frame)
-    # Reconstruct the signal
-    y_formant = np.concatenate([frame[:hop_length] for frame in modified_frames[:-1]] +
-                              [modified_frames[-1]])
-    return librosa.util.normalize(y_formant)
-def enhance_harmonics(y, sr):
-    # Extract harmonics using harmonic-percussive source separation
-    y_harmonic = librosa.effects.hpss(y)[0]
-    # Enhance the harmonics
-    y_enhanced = y_harmonic * 1.2 + y * 0.3
-    return librosa.util.normalize(y_enhanced)
 def process_audio_advanced(audio_file, settings):
-    # Load audio
-    y, sr = librosa.load(audio_file)
-    # Pitch shifting with formant preservation
-    y_shifted = librosa.effects.pitch_shift(
-        y,
-        sr=sr,
-        n_steps=settings['pitch_shift']
-    )
-    # Modify formants
-    y_formant = modify_formants(
-        y_shifted,
         sr,
-        settings['formant_shift']
     )
-    # Enhance harmonics
-    y_harmonic = enhance_harmonics(y_formant, sr)
-    # Apply vocal tract length modification through resampling
-    y_vtln = librosa.effects.time_stretch(
-        y_harmonic,
-        rate=settings['vtln_factor']
-    )
-    # Smooth the output
-    y_smooth = signal.savgol_filter(y_vtln, 1001, 2)
-    # Final normalization
-    y_final = librosa.util.normalize(y_smooth)
     return y_final, sr
 def create_voice_preset(preset_name):
     presets = {
         'Young Female': {
-            'pitch_shift': 8.0,
-            'formant_shift': 1.3,
-            'vtln_factor': 1.1,
-            'breathiness': 0.3
         },
         'Mature Female': {
-            'pitch_shift': 6.0,
-            'formant_shift': 1.2,
-            'vtln_factor': 1.05,
-            'breathiness': 0.2
         },
         'Soft Female': {
-            'pitch_shift': 7.0,
-            'formant_shift': 1.25,
-            'vtln_factor': 1.15,
-            'breathiness': 0.4
         }
     }
     return presets.get(preset_name)
-def add_breathiness(y, sr, amount=0.3):
-    # Generate breath noise
-    noise = np.random.normal(0, 0.01, len(y))
-    noise_filtered = signal.lfilter([1], [1, -0.98], noise)
-    # Mix with original signal
-    y_breathy = y * (1 - amount) + noise_filtered * amount
-    return librosa.util.normalize(y_breathy)
-st.title("Advanced Female Voice Converter")
-# File uploader
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
-    # Save uploaded file temporarily
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
         tmp_file.write(uploaded_file.getvalue())
         tmp_path = tmp_file.name
-    # Voice preset selector
     preset_name = st.selectbox(
         "Select Voice Preset",
         ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
@@ -133,10 +127,9 @@ if uploaded_file is not None:
     if preset_name == 'Custom':
         settings = {
-            'pitch_shift': st.slider("Pitch Shift", 0.0, 12.0, 8.0, 0.5),
-            'formant_shift': st.slider("Formant Shift", 1.0, 1.5, 1.2, 0.05),
-            'vtln_factor': st.slider("Vocal Tract Length", 0.9, 1.2, 1.1, 0.05),
-            'breathiness': st.slider("Breathiness", 0.0, 1.0, 0.3, 0.1)
         }
     else:
         settings = create_voice_preset(preset_name)
@@ -144,16 +137,8 @@ if uploaded_file is not None:
     if st.button("Convert Voice"):
         with st.spinner("Processing audio..."):
             try:
-                # Process audio
                 processed_audio, sr = process_audio_advanced(tmp_path, settings)
-                # Add breathiness
-                processed_audio = add_breathiness(
-                    processed_audio,
-                    sr,
-                    settings['breathiness']
-                )
                 # Save to buffer
                 buffer = BytesIO()
                 sf.write(buffer, processed_audio, sr, format='WAV')
@@ -173,20 +158,15 @@ if uploaded_file is not None:
                 st.error(f"Error processing audio: {str(e)}")
 st.markdown("""
-### Voice Conversion Features:
-- Pitch shifting with formant preservation
-- Harmonic enhancement
-- Vocal tract length modification
-- Natural breathiness addition
-- Multiple voice presets
-- Custom parameter controls
 ### Tips for Best Results:
-1. Start with a clear audio recording
-2. Try different presets to find the best match
-3. For custom settings:
-   - Pitch shift: 6-8 for natural female voice
-   - Formant shift: 1.1-1.3 for feminine resonance
-   - Vocal tract length: 1.05-1.15 for realistic results
-   - Breathiness: 0.2-0.4 for natural sound
 """)

 import soundfile as sf
 import numpy as np
 import scipy.signal as signal
 from io import BytesIO
 import tempfile
def pitch_shift_with_formant_preservation(y, sr, n_steps):
    """Shift the pitch of ``y`` by ``n_steps`` steps of 1/12 octave.

    This is a thin wrapper around ``librosa.effects.pitch_shift``. Despite
    the function name, no separate formant processing happens here: the
    signal is only pitch-shifted.

    Args:
        y: Audio samples to shift.
        sr: Sample rate of ``y`` in Hz.
        n_steps: Number of steps to shift by (may be fractional); with
            ``bins_per_octave=12`` one step is one semitone.

    Returns:
        The pitch-shifted audio as returned by librosa.
    """
    # NOTE(review): 'kaiser_fast' selects a Kaiser-windowed resampler; in
    # recent librosa releases this is presumably served by the optional
    # `resampy` backend -- confirm it is installed in the deployment.
    return librosa.effects.pitch_shift(
        y=y,
        sr=sr,
        n_steps=n_steps,
        bins_per_octave=12,
        res_type='kaiser_fast',
    )
def enhance_female_characteristics(y, sr, settings):
    """Blend the harmonic component back into ``y`` and apply the voice EQ.

    Args:
        y: Audio samples to process.
        sr: Sample rate of ``y`` in Hz.
        settings: Mapping that must contain ``'harmonic_boost'``, the blend
            weight of the harmonic component (``0`` keeps ``y`` unchanged,
            ``1`` keeps only the harmonic component).

    Returns:
        The result of ``apply_female_eq`` on the blended signal.

    Raises:
        KeyError: If ``settings`` has no ``'harmonic_boost'`` entry.
    """
    # Read the setting first so a bad config fails before the (expensive)
    # source separation runs.
    boost = settings['harmonic_boost']

    # hpss returns (harmonic, percussive); only the harmonic part is used.
    y_harmonic, _ = librosa.effects.hpss(y, margin=3.0, kernel_size=31)

    # Cross-fade between the untouched signal and its harmonic component.
    y_enhanced = y_harmonic * boost + y * (1 - boost)

    return apply_female_eq(y_enhanced, sr)
def apply_female_eq(y, sr, mid_gain=1.0, high_gain=0.3):
    """Boost the 1-2 kHz and 3-5 kHz bands of ``y`` and peak-normalize.

    Each band is isolated with a zero-phase 2nd-order Butterworth band-pass
    and added, scaled by its gain, on top of the unmodified signal. Keeping
    the dry signal in the mix is what makes this a boost: the rest of the
    spectrum (including the fundamental) is retained instead of being
    discarded by the band-pass.

    Args:
        y: Audio samples (1-D array).
        sr: Sample rate of ``y`` in Hz.
        mid_gain: Linear gain applied to the 1-2 kHz band before mixing.
        high_gain: Linear gain applied to the 3-5 kHz band before mixing.

    Returns:
        The equalized signal, normalized with ``librosa.util.normalize``.
    """
    nyquist = sr / 2
    bands = (
        ((1000, 2000), mid_gain),
        ((3000, 5000), high_gain),
    )

    y_eq = np.asarray(y, dtype=float)
    for (low_hz, high_hz), gain in bands:
        # A band reaching the Nyquist frequency cannot be designed
        # (scipy requires 0 < Wn < 1), so skip it instead of raising.
        if high_hz >= nyquist:
            continue
        b, a = signal.butter(
            2, [low_hz / nyquist, high_hz / nyquist], btype='band'
        )
        # Out-of-place add so the caller's array is never modified.
        y_eq = y_eq + gain * signal.filtfilt(b, a, y)

    return librosa.util.normalize(y_eq)
def add_breathiness(y, sr, amount, rng=None):
    """Mix low-pass filtered Gaussian noise into ``y`` and normalize.

    Args:
        y: Audio samples (assumed 1-D; the noise length is ``len(y)``).
        sr: Sample rate of ``y`` in Hz.
        amount: Mix weight of the noise; the signal is weighted by
            ``1 - amount``.
        rng: Optional ``numpy.random.Generator`` used to draw the noise,
            for reproducible output. When ``None`` the global NumPy random
            state is used.

    Returns:
        The mixed signal normalized with ``librosa.util.normalize``; an
        empty input is returned as an empty array.
    """
    y = np.asarray(y)
    n_samples = len(y)
    if n_samples == 0:
        # Nothing to mix into; filtering/normalizing an empty array fails.
        return y

    if rng is None:
        noise = np.random.normal(0, 0.005, n_samples)
    else:
        noise = rng.normal(0, 0.005, n_samples)

    # Keep only the part of the noise below 2 kHz.
    b, a = signal.butter(2, 2000 / (sr / 2), btype='lowpass')

    # filtfilt pads each edge with 3 * max(len(a), len(b)) samples by
    # default and rejects inputs that are not longer than that; shrink the
    # padding for very short clips so they are still processed.
    default_padlen = 3 * max(len(a), len(b))
    breath_noise = signal.filtfilt(
        b, a, noise, padlen=min(default_padlen, n_samples - 1)
    )

    y_breathy = y * (1 - amount) + breath_noise * amount
    return librosa.util.normalize(y_breathy)
 def process_audio_advanced(audio_file, settings):
     """Run the full voice-conversion chain on one audio file.

     Steps: load at 24 kHz, remove the DC offset, pitch-shift, enhance the
     harmonic content / apply EQ, optionally add breath noise, smooth with a
     Savitzky-Golay filter and peak-normalize.

     Args:
         audio_file: Path (or file object) accepted by ``librosa.load``.
         settings: Mapping with ``'pitch_shift'`` and ``'breathiness'``
             (``'harmonic_boost'`` is read by the enhancement step).
             An optional ``'smoothing_window'`` entry overrides the
             smoothing window length in samples (default ``1001``).

     Returns:
         Tuple ``(samples, sample_rate)``.
     """
     smoothing_polyorder = 2
     y, sr = librosa.load(audio_file, sr=24000)
     # Centre the waveform on zero before any processing.
     y = librosa.util.normalize(y - np.mean(y))
     y_shifted = pitch_shift_with_formant_preservation(
         y,
         sr,
         settings['pitch_shift'],
     )
     y_enhanced = enhance_female_characteristics(y_shifted, sr, settings)
     if settings['breathiness'] > 0:
         y_enhanced = add_breathiness(y_enhanced, sr, settings['breathiness'])
     # savgol_filter rejects windows longer than the signal, so clamp the
     # window for short clips and keep it odd.
     # NOTE(review): a 1001-sample, order-2 window is a very strong low-pass
     # at 24 kHz -- confirm this much smoothing is intended.
     window = min(int(settings.get('smoothing_window', 1001)), len(y_enhanced))
     if window % 2 == 0:
         window -= 1
     if window > smoothing_polyorder:
         y_enhanced = signal.savgol_filter(
             y_enhanced, window, smoothing_polyorder
         )
     # Normalize last: the smoothing is linear, so doing it afterwards only
     # changes the output gain and guarantees a full-scale peak.
     y_final = librosa.util.normalize(y_enhanced)
     return y_final, sr
 def create_voice_preset(preset_name):
     """Look up the settings dictionary for a named voice preset.

     Args:
         preset_name: One of ``'Young Female'``, ``'Mature Female'`` or
             ``'Soft Female'``.

     Returns:
         A new dict with ``'pitch_shift'``, ``'harmonic_boost'`` and
         ``'breathiness'`` entries, or ``None`` for an unknown name.
     """
     setting_names = ('pitch_shift', 'harmonic_boost', 'breathiness')
     # One row per preset, values in the same order as ``setting_names``.
     preset_table = {
         'Young Female': (4.0, 0.3, 0.15),
         'Mature Female': (3.0, 0.2, 0.1),
         'Soft Female': (3.5, 0.25, 0.2),
     }
     row = preset_table.get(preset_name)
     if row is None:
         return None
     return dict(zip(setting_names, row))
+st.title("Improved Female Voice Converter")
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
         tmp_file.write(uploaded_file.getvalue())
         tmp_path = tmp_file.name
     preset_name = st.selectbox(
         "Select Voice Preset",
         ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
     if preset_name == 'Custom':
         settings = {
+            'pitch_shift': st.slider("Pitch Shift", 0.0, 6.0, 4.0, 0.5),
+            'harmonic_boost': st.slider("Harmonic Enhancement", 0.0, 0.5, 0.3, 0.05),
+            'breathiness': st.slider("Breathiness", 0.0, 0.3, 0.15, 0.05)
         }
     else:
         settings = create_voice_preset(preset_name)
     if st.button("Convert Voice"):
         with st.spinner("Processing audio..."):
             try:
                 processed_audio, sr = process_audio_advanced(tmp_path, settings)
                 # Save to buffer
                 buffer = BytesIO()
                 sf.write(buffer, processed_audio, sr, format='WAV')
                 st.error(f"Error processing audio: {str(e)}")
 st.markdown("""
 ### Tips for Best Results:
+1. Use high-quality input audio with clear speech
+2. Start with presets and adjust if needed
+3. Keep pitch shift between 3-5 for most natural results
+4. Use minimal breathiness (0.1-0.2) for realistic sound
+5. Record in a quiet environment with minimal background noise
+### Recommended Settings:
+- For younger female voice: pitch shift 4.0, harmonic boost 0.3
+- For mature female voice: pitch shift 3.0, harmonic boost 0.2
+- For soft female voice: pitch shift 3.5, harmonic boost 0.25
 """)