import torch
import torchaudio
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from pydub import AudioSegment
import os
import gradio as gr

# Load the model and processor
model_id = "hackergeek98/whisper-fa-tinyyy"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Create ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if torch.cuda.is_available() else -1,
)

# Convert audio to WAV format
def convert_to_wav(audio_path):
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1)  # Ensure mono audio
    wav_path = "converted_audio.wav"
    audio.export(wav_path, format="wav")
    return wav_path

# Split long audio into chunks
def split_audio(audio_path, chunk_length_ms=30000):  # Default: 30 sec per chunk
    audio = AudioSegment.from_wav(audio_path)
    chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    chunk_paths = []
    
    for i, chunk in enumerate(chunks):
        chunk_path = f"chunk_{i}.wav"
        chunk.export(chunk_path, format="wav")
        chunk_paths.append(chunk_path)
    
    return chunk_paths

# Transcribe a single chunk (convert stereo to mono before processing)
def transcribe_audio_chunk(chunk_path):
    waveform, sampling_rate = torchaudio.load(chunk_path)  # Load audio as (channels, samples)
    if waveform.shape[0] > 1:  # If stereo (more than 1 channel)
        waveform = torch.mean(waveform, dim=0, keepdim=True)  # Average channels to mono
    waveform = waveform.squeeze(0).numpy()  # Pipeline expects a 1-D numpy array
    result = pipe({"raw": waveform, "sampling_rate": sampling_rate})  # Pass raw samples
    return result["text"]

# Transcribe a long audio file
def transcribe_long_audio(audio_path):
    wav_path = convert_to_wav(audio_path)
    chunk_paths = split_audio(wav_path)
    transcription = ""
    
    for chunk in chunk_paths:
        transcription += transcribe_audio_chunk(chunk) + "\n"
        os.remove(chunk)  # Remove processed chunk
    
    os.remove(wav_path)  # Cleanup original file
    
    return transcription

# Gradio interface
def transcribe_interface(audio_file):
    if not audio_file:
        return "No file uploaded."
    return transcribe_long_audio(audio_file)

iface = gr.Interface(
    fn=transcribe_interface,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper ASR - Transcription",
    description="Upload an audio file, and the model will transcribe it."
)

if __name__ == "__main__":
    iface.launch()