Spaces:

eolang
/

TTS-STT

Running

App Files Files Community

eolang committed on Apr 10

Commit

28aa278

verified ·

1 Parent(s): 8b2fc06

Update app.py

Browse files

Files changed (1) hide show

app.py +187 -0

app.py CHANGED Viewed

	@@ -0,0 +1,187 @@

+import warnings
+# Silence all transformers warnings
+transformers_logging.set_verbosity_error()
+warnings.filterwarnings("ignore", category=UserWarning)
+import gradio as gr
+import torch
+from transformers import (
+    SpeechT5Processor,
+    SpeechT5ForTextToSpeech,
+    SpeechT5HifiGan,
+    pipeline
+)
+import json
+import soundfile as sf
+import numpy as np
+from huggingface_hub import login
+from jiwer import wer
+from transformers.utils import logging as transformers_logging
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import os
+# -------------------------------------------------------------------------------------------------------------------
+# Authentication $ Env Setup
+HF_Key = os.environ.get("HF_Key")
+login(token = HF_Key)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Silence all transformers warnings
+transformers_logging.set_verbosity_error()
+warnings.filterwarnings("ignore", category=UserWarning)
+# -------------------------------------------------------------------------------------------------------------------
+def cosine_sim_wer_single(reference, prediction):
+    """
+    Calculate a WER-like metric based on cosine similarity for a single reference-prediction pair
+    Args:
+        reference: Single reference transcript (string)
+        prediction: Single model prediction (string)
+    Returns:
+        Error rate based on cosine similarity (100% - similarity%)
+    """
+    # Clean inputs
+    ref = reference.strip() if reference else ""
+    pred = prediction.strip() if prediction else ""
+    # Handle empty inputs
+    if not ref or not pred:
+        print("Warning: Empty reference or prediction")
+        return 100.0  # Return 100% error for invalid input
+    try:
+        # Use character n-grams to handle morphological variations better
+        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))
+        # Fit and transform
+        vectors = vectorizer.fit_transform([ref, pred])
+        # Calculate cosine similarity
+        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
+        # Convert to error rate (100% - similarity%)
+        error_rate = 100.0 - similarity
+        print(f"Similarity: {similarity:.2f}%")
+        print(f"Error rate: {error_rate:.2f}%")
+    except Exception as e:
+        print(f"Error calculating similarity: {e}")
+        return 100.0  # Return 100% error in case of calculation failure
+# -------------------------------------------------------------------------------------------------------------------
+## TTS Module
+speaker_file_path = 'speaker2.json'
+model_id = 'eolang/speecht5_v4-2'
+with open(speaker_file_path, 'r') as file:
+    example = json.load(file)
+speaker_embeddings = torch.tensor(example).unsqueeze(0)
+l_model = SpeechT5ForTextToSpeech.from_pretrained(
+    "eolang/speecht5_v4-2"
+)
+l_processor = SpeechT5Processor.from_pretrained("eolang/speecht5_v4-2")
+l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+def synthesize(input_text):
+    inputs = l_processor(text=input_text, return_tensors="pt")
+    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)
+    # Audio(speech.numpy(), rate=16000)
+    sf.write('test_output.wav', speech.numpy(), 16000)
+    # return speech
+# -------------------------------------------------------------------------------------------------------------------
+## STT Module
+### Custom/Tunned Whisper
+tuned_pipeline = pipeline(
+    "automatic-speech-recognition",
+    model="eolang/whisper-small-sw-WER-13-zindi",
+    device = device,
+    return_timestamps=True,
+    generate_kwargs={
+        "no_repeat_ngram_size": 3,  # Blocks repeating 3-grams
+        "repetition_penalty": 1.5,  # Penalize repetitions (1.0 = no penalty)
+    }
+)
+def tunned_transcribe(filepath):
+    transcription = tuned_pipeline(filepath, return_timestamps=True)
+    return transcription["text"]
+### OpenAI WHisper (Un-tuned)
+openai_pipeline = pipeline(
+    "automatic-speech-recognition",
+    model="openai/whisper-small",
+    device = device,
+    return_timestamps=True,
+    generate_kwargs={
+        "no_repeat_ngram_size": 3,  # Blocks repeating 3-grams
+        "repetition_penalty": 1.5,  # Penalize repetitions (1.0 = no penalty)
+    }
+)
+def openai_transcribe(filepath):
+    transcription = openai_pipeline(filepath, return_timestamps=True)
+    return transcription["text"]
+# -------------------------------------------------------------------------------------------------------------------
+## Full Loop module
+def full_loop(ref_text):
+    # synthesize
+    synthesize(ref_text)
+    # Get transcriptions USING THE WRAPPER FUNCTIONS that return just text
+    tunned_transcription = tunned_transcribe('test_output.wav')
+    openai_trancsription = openai_transcribe('test_output.wav')
+    tunned_WER = wer(ref_text, tunned_transcription)
+    base_WER = wer(ref_text, openai_trancsription)
+    result = f'Tunned Model transciption: {tunned_transcription}\n'
+    result += f"Word error rate for the tunned model: {round(tunned_WER, 2)}\n"
+    # Call cosine sim for tuned model (this will print results)
+    cosine_sim_wer_single(ref_text, tunned_transcription)
+    result += f'\nBase Model transciption: {openai_trancsription}\n'
+    result += f"Word error rate for base-untunned model: {round(base_WER, 2)}\n"
+    # Call cosine sim for base model (this will print results)
+    cosine_sim_wer_single(ref_text, openai_trancsription)
+    return 'test_output.wav', result
+# -------------------------------------------------------------------------------------------------------------------
+# Add minimal Gradio wrapper
+# Create a simple Gradio interface
+demo = gr.Interface(
+    fn=full_loop,  # Use your existing function without modifications
+    inputs=gr.Textbox(value="Kuna mambo kadhaa yanayoitajika kuzingatiwa wakati wa kufundisha modeli."),
+    outputs=[gr.Audio(), gr.Textbox()],
+    title="TTS-STT Evaluation"
+)
+# Launch the interface
+if __name__ == "__main__":
+    demo.launch()