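"""Gradio app: long-form Persian (Farsi) speech-to-text with a fine-tuned Whisper model.

Flow: convert the upload to mono WAV -> split it into 30-second chunks ->
transcribe each chunk with the Hugging Face ASR pipeline -> join the texts.
"""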
import torch
import torchaudio
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pydub import AudioSegment
import os
import gradio as gr
# Load the model and processor
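# hackergeek98/whisper-fa-tinyyy appears to be a Whisper-tiny checkpoint fine-tuned for Persian (Farsi)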
model_id = "hackergeek98/whisper-fa-tinyyy"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
processor = AutoProcessor.from_pretrained(model_id)
# Create ASR pipeline
pipe = pipeline(
"automatic-speech-recognition",
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
device=0 if torch.cuda.is_available() else -1,
)
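# Note: when fed {"raw", "sampling_rate"} inputs, the ASR pipeline resamples the
# audio to the feature extractor's rate (16 kHz for Whisper) via torchaudio, so
# the chunks below do not need manual resampling.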
# Convert audio to WAV format
def convert_to_wav(audio_path):
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1)  # Ensure mono audio
    wav_path = "converted_audio.wav"
    audio.export(wav_path, format="wav")
    return wav_path
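# pydub decodes non-WAV inputs through ffmpeg, which must be installed and on PATH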
# Split long audio into chunks
def split_audio(audio_path, chunk_length_ms=30000): # Default: 30 sec per chunk
    audio = AudioSegment.from_wav(audio_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    chunk_paths = []
    for i, chunk in enumerate(chunks):
        chunk_path = f"chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        chunk_paths.append(chunk_path)
    return chunk_paths
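# 30-second chunks match Whisper's fixed 30 s input window, so each chunk is
# transcribed in a single forward pass.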
# Transcribe a single chunk (downmixing stereo to mono first)
def transcribe_audio_chunk(chunk_path):
    waveform, sampling_rate = torchaudio.load(chunk_path)  # Load audio as a float tensor
    if waveform.shape[0] > 1:  # Stereo: more than one channel
        waveform = torch.mean(waveform, dim=0, keepdim=True)  # Downmix to mono
    waveform = waveform.squeeze(0).numpy()  # Pipeline expects a 1-D numpy array
    result = pipe({"raw": waveform, "sampling_rate": sampling_rate})  # Pass raw samples
    return result["text"]
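# (Assumption) If the checkpoint's generation config does not pin the output
# language, Whisper may auto-detect it per chunk; transcription can be forced
# to Persian at call time, e.g.:
#   pipe({"raw": waveform, "sampling_rate": sampling_rate},
#        generate_kwargs={"language": "persian", "task": "transcribe"})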
# Transcribe a long audio file
def transcribe_long_audio(audio_path):
    wav_path = convert_to_wav(audio_path)
    chunk_paths = split_audio(wav_path)
    transcription = ""
    for chunk in chunk_paths:
        transcription += transcribe_audio_chunk(chunk) + "\n"
        os.remove(chunk)  # Remove processed chunk
    os.remove(wav_path)  # Clean up the converted WAV file
    return transcription
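# Alternative sketch: transformers' ASR pipeline can also chunk long audio
# itself, with striding across boundaries so words cut at a chunk edge are
# recovered, avoiding the manual splitting above:
#   pipe(wav_path, chunk_length_s=30)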
# Gradio interface
def transcribe_interface(audio_file):
    if not audio_file:
        return "No file uploaded."
    return transcribe_long_audio(audio_file)
iface = gr.Interface(
    fn=transcribe_interface,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper ASR - Transcription",
    description="Upload an audio file, and the model will transcribe it.",
)
if __name__ == "__main__":
    iface.launch()