# Spaces: Sleeping (status text captured from the hosting page; not part of the program)
import numpy as np | |
import librosa | |
import soundfile as sf | |
from datasets import load_dataset | |
from transformers import pipeline | |
# Build the speech-recognition and text-to-speech pipelines once, at module
# import, so the functions in this file share the already-loaded models.
asr = pipeline(
    task="automatic-speech-recognition",
    model="distil-whisper/distil-small.en",
)
narrator = pipeline(
    task="text-to-speech",
    model="kakao-enterprise/vits-ljs",
)
# Speech-to-Text Function
def transcribe_speech(filepath):
    """Transcribe a short audio recording with the module-level ``asr`` pipeline.

    Args:
        filepath: Path to the audio file, or ``None`` when no audio was
            supplied.

    Returns:
        The ``"text"`` entry of the pipeline result, or a retry message when
        ``filepath`` is ``None``.
    """
    if filepath is not None:
        return asr(filepath)["text"]
    return "No audio found. Please retry."
# Long-Form Audio Transcription
def transcribe_long_form(filepath):
    """Transcribe a long recording in 30-second chunks.

    The file is read with ``soundfile``, downmixed to mono, resampled to
    16 kHz, and passed to the module-level ``asr`` pipeline with chunking and
    timestamps enabled.

    Args:
        filepath: Path to the audio file, or ``None`` when no audio was
            supplied.

    Returns:
        The text of every returned chunk joined with newlines, or a retry
        message when ``filepath`` is ``None``.
    """
    if filepath is None:
        return "No audio found. Please retry."

    waveform, source_rate = sf.read(filepath)
    # The array is transposed before downmixing, as in the original code;
    # presumably soundfile yields (frames, channels) while librosa.to_mono
    # expects channels first -- confirm against the library docs.
    mono = librosa.to_mono(waveform.T)
    resampled = librosa.resample(mono, orig_sr=source_rate, target_sr=16000)

    result = asr(
        resampled,
        chunk_length_s=30,
        batch_size=4,
        return_timestamps=True,
    )
    return "\n".join(segment["text"] for segment in result["chunks"])
# Text-to-Speech Function
def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level ``narrator`` pipeline.

    Args:
        text: The text to narrate. ``None``, an empty string, or a
            whitespace-only string is treated as "no input".

    Returns:
        ``(sampling_rate, audio_array)`` on success, where ``audio_array`` is
        the flattened first entry of the pipeline's ``"audio"`` output.
        When no usable text is given, a plain message string is returned
        instead.
    """
    # Guard against None as well as blank input; calling .strip() on None
    # would raise AttributeError, unlike the transcription functions in this
    # file, which return a message for missing input.
    # NOTE(review): this branch returns a str while the success path returns
    # a tuple -- confirm the caller can display both.
    if text is None or not text.strip():
        return "No text provided. Please enter text to synthesize."

    narrated_text = narrator(text)
    # Take the first entry of the pipeline's audio output and flatten it to 1D.
    audio_array = narrated_text["audio"][0].flatten()
    sampling_rate = narrated_text["sampling_rate"]
    return sampling_rate, audio_array
# Sample Dataset Access Function
def get_dataset_sample(idx):
    """Fetch one example from the start of the streamed LibriSpeech split.

    Args:
        idx: Index into the first five streamed examples of the
            ``train.clean.100`` split.

    Returns:
        ``((audio_array, sampling_rate), transcription)`` for the selected
        example. Note the audio tuple is ordered array first, rate second.
    """
    stream = load_dataset(
        "librispeech_asr",
        split="train.clean.100",
        streaming=True,
        trust_remote_code=True,
    )
    # Only the first five examples are materialised; idx selects among them.
    first_examples = list(stream.take(5))
    record = first_examples[idx]
    clip = record["audio"]
    waveform = clip["array"]
    rate = clip["sampling_rate"]
    return (waveform, rate), record["text"]