# Spaces: eolang / TTS-STT (Running)
#
# File size: 6,209 Bytes

import os
import warnings
from transformers.utils import logging as transformers_logging

# Silence all transformers warnings
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)

import gradio as gr
import torch
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    pipeline
)
import json
import soundfile as sf
import numpy as np
from huggingface_hub import login
from jiwer import wer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------------------------------------------------------------------------------------------------

# Authentication & Env Setup
HF_Key = os.environ.get("HF_Key")

# Authenticate only when a token is actually configured. Calling login() with
# token=None falls back to an interactive prompt, which cannot be answered
# when the app runs unattended.
if HF_Key:
    login(token=HF_Key)
else:
    print("Warning: HF_Key is not set; skipping Hugging Face Hub login")

# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Silence all transformers warnings (same calls as at the top of the file;
# presumably repeated in case a later import reset the filters).
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)


# -------------------------------------------------------------------------------------------------------------------

def cosine_sim_wer_single(reference, prediction):
    """
    Calculate a WER-like metric based on cosine similarity for a single reference-prediction pair

    The two strings are vectorised as counts of character 2- and 3-grams
    (word-boundary aware), and the error rate is 100% minus their cosine
    similarity expressed as a percentage. The similarity and error rate are
    also printed.

    Args:
        reference: Single reference transcript (string; None or empty allowed)
        prediction: Single model prediction (string; None or empty allowed)

    Returns:
        Error rate as a float percentage (100% - similarity%). 100.0 is
        returned when either input is empty or whitespace-only, or when the
        similarity computation fails.
    """
    # Clean inputs; None is treated the same as an empty string
    ref = reference.strip() if reference else ""
    pred = prediction.strip() if prediction else ""

    # Handle empty inputs
    if not ref or not pred:
        print("Warning: Empty reference or prediction")
        return 100.0  # Return 100% error for invalid input

    try:
        # Use character n-grams to handle morphological variations better
        vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))

        # Row 0 is the reference, row 1 is the prediction
        vectors = vectorizer.fit_transform([ref, pred])

        # Cosine similarity of the two rows, scaled to a percentage
        similarity = float(cosine_similarity(vectors[0:1], vectors[1:2])[0][0]) * 100
    except Exception as e:
        # Best-effort metric: report the failure and fall back to full error
        print(f"Error calculating similarity: {e}")
        return 100.0  # Return 100% error in case of calculation failure

    # Convert to error rate (100% - similarity%); clamp at 0 so floating-point
    # round-off on identical strings cannot yield a tiny negative rate
    error_rate = max(0.0, 100.0 - similarity)

    print(f"Similarity: {similarity:.2f}%")
    print(f"Error rate: {error_rate:.2f}%")

    return error_rate
    
# -------------------------------------------------------------------------------------------------------------------

## TTS Module
# Speaker embedding file and the TTS checkpoint shared by model and processor
speaker_file_path = 'speaker2.json'
model_id = 'eolang/speecht5_v4-2'

# The speaker file is JSON; read it with an explicit encoding so the result
# does not depend on the platform's default locale.
with open(speaker_file_path, 'r', encoding='utf-8') as file:
    example = json.load(file)

# unsqueeze(0) adds a leading dimension of size 1 in front of the loaded values
speaker_embeddings = torch.tensor(example).unsqueeze(0)

# Model and processor are loaded from the same checkpoint (model_id); the
# vocoder comes from a separate checkpoint.
l_model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
l_processor = SpeechT5Processor.from_pretrained(model_id)
l_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

def synthesize(input_text, output_path='test_output.wav'):
    """
    Synthesize speech for the given text and write it to a WAV file.

    Args:
        input_text: Text to convert to speech (string)
        output_path: Destination audio file; defaults to 'test_output.wav'

    Returns:
        The path the audio was written to (same value as output_path)
    """
    # Tokenize the text into PyTorch tensors
    inputs = l_processor(text=input_text, return_tensors="pt")

    # Generate the waveform using the module-level speaker embedding and vocoder
    speech = l_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=l_vocoder)

    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid if the model
    # is ever moved to a GPU; the file is written at a 16000 Hz sample rate
    sf.write(output_path, speech.cpu().numpy(), 16000)

    return output_path

# -------------------------------------------------------------------------------------------------------------------

## STT Module
### Custom/Tuned Whisper
# ASR pipeline backed by the fine-tuned Whisper checkpoint.
tuned_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="eolang/whisper-small-sw-WER-13-zindi",
    device=device,
    return_timestamps=True,
    generate_kwargs=dict(
        no_repeat_ngram_size=3,  # never emit the same 3-gram twice
        repetition_penalty=1.5,  # values above 1.0 penalize repeated tokens
    ),
)


def tunned_transcribe(filepath):
    """
    Transcribe an audio file with the tuned Whisper pipeline.

    Args:
        filepath: Path of the audio file handed to the pipeline

    Returns:
        The "text" entry of the pipeline output
    """
    output = tuned_pipeline(filepath, return_timestamps=True)
    text = output["text"]
    return text



### OpenAI Whisper (Un-tuned)
# ASR pipeline backed by the stock openai/whisper-small checkpoint, configured
# with the same generation settings as the tuned pipeline.
openai_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    return_timestamps=True,
    generate_kwargs=dict(
        no_repeat_ngram_size=3,  # never emit the same 3-gram twice
        repetition_penalty=1.5,  # values above 1.0 penalize repeated tokens
    ),
)


def openai_transcribe(filepath):
    """
    Transcribe an audio file with the un-tuned OpenAI Whisper pipeline.

    Args:
        filepath: Path of the audio file handed to the pipeline

    Returns:
        The "text" entry of the pipeline output
    """
    output = openai_pipeline(filepath, return_timestamps=True)
    text = output["text"]
    return text

# -------------------------------------------------------------------------------------------------------------------

## Full Loop module
def full_loop(ref_text):
    """
    Run the text -> speech -> text round trip and report word error rates.

    The reference text is synthesized to 'test_output.wav', transcribed by
    both the tuned and the base Whisper pipelines, and each transcription is
    scored against the reference with jiwer's WER. The cosine-similarity
    metric is also invoked for each transcription; its figures are printed
    rather than added to the report.

    Args:
        ref_text: Reference text to synthesize and compare against (string)

    Returns:
        Tuple of (path to the synthesized audio, multi-line report string)
    """
    audio_path = 'test_output.wav'

    # Text -> speech (writes the audio file read back below)
    synthesize(ref_text)

    # Speech -> text via the wrapper functions, which return plain strings
    tuned_text = tunned_transcribe(audio_path)
    base_text = openai_transcribe(audio_path)

    # Word error rate of each transcription against the reference
    tuned_score = wer(ref_text, tuned_text)
    base_score = wer(ref_text, base_text)

    # Cosine-similarity metric: tuned model first, then base model
    # (each call prints its own results)
    cosine_sim_wer_single(ref_text, tuned_text)
    cosine_sim_wer_single(ref_text, base_text)

    report_lines = [
        f'Tunned Model transciption: {tuned_text}\n',
        f"Word error rate for the tunned model: {round(tuned_score, 2)}\n",
        f'\nBase Model transciption: {base_text}\n',
        f"Word error rate for base-untunned model: {round(base_score, 2)}\n",
    ]

    return audio_path, "".join(report_lines)

# -------------------------------------------------------------------------------------------------------------------
# Add minimal Gradio wrapper

# Create a simple Gradio interface
# One text box in; synthesized audio plus the textual report out.
demo = gr.Interface(
    title="TTS-STT Evaluation",
    fn=full_loop,  # returns (audio file path, report string)
    inputs=gr.Textbox(
        value="Kuna mambo kadhaa yanayoitajika kuzingatiwa wakati wa kufundisha modeli."
    ),
    outputs=[gr.Audio(), gr.Textbox()],
)

# Start the web UI only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()